//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));
// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}
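
// For example, a v4i16 load (64 bits) maps to v2i32, an i16 load keeps its
// type, and a v3i16 load (48 bits) is returned unchanged since 48 is not a
// multiple of 32.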

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must
  // be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
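
// For example, a value known to lie in [-128, 127] reports 8 significant
// bits here, so the product of two such values is exactly representable in a
// 24-bit (and hence f32-based) multiply.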

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP10},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP10},
                     {MVT::f16, MVT::f32},
                     Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
       MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
       MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
       MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS,          ISD::FMINNUM,        ISD::FMAXNUM,
         ISD::FADD,          ISD::FCEIL,          ISD::FCOS,
         ISD::FDIV,          ISD::FEXP2,          ISD::FEXP,
         ISD::FEXP10,        ISD::FLOG2,          ISD::FREM,
         ISD::FLOG,          ISD::FLOG10,         ISD::FPOW,
         ISD::FFLOOR,        ISD::FTRUNC,         ISD::FMUL,
         ISD::FMA,           ISD::FRINT,          ISD::FNEARBYINT,
         ISD::FSQRT,         ISD::FSIN,           ISD::FSUB,
         ISD::FNEG,          ISD::VSELECT,        ISD::SELECT_CC,
         ISD::FCOPYSIGN,     ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);
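  // For reference, a sketch (assuming the generic BypassSlowDivision
  // transform) of the guard this requests for a 64-bit unsigned division:
  //   if (((a | b) >> 32) == 0)
  //     q = (uint32_t)a / (uint32_t)b; // cheap 32-bit path
  //   else
  //     q = a / b;                     // full 64-bit expansion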

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}
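
// For example, fneg (bitcast (build_vector x, y)) with a 32-bit high element
// can fold the negate into y (an f64 sign flip is an XOR of the high half),
// and fneg (bitcast (select c, a, b)) folds when the select result is f32.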

/// Returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}
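
// For example, FMA and FMAD take three operands and so always use the 64-bit
// VOP3 encoding, where source modifiers come for free; f64 operations
// likewise have no 32-bit encoding.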

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type for ISD::SELECT.
LLVM_READNONE
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
    return false;

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each one will necessitate using VOP3, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}
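
// For example, an i48 return type is widened to i64 (the next multiple of
// 32), while i8 and i16 results are returned in an i32.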

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
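
// For example, narrowing a 64-bit load to 32 bits is always accepted here,
// but a 4-byte-aligned uniform 32-bit constant load is not narrowed to 16
// bits, since the scalar unit has no sub-dword loads (pre-GFX12).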

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}
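
// For example, i64 -> i32 truncation is free (it reads the low 32-bit
// subregister of the register pair), and i32 -> i16 is free when 16-bit
// instructions are available.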

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}
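
// For example, zext i32 -> i64 costs nothing at register level: the 64-bit
// value is just the original 32-bit register paired with a zeroed high half.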

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered frame index.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getUNDEF(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}
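
// For example, splitting an i64 yields Lo = the low 32 bits (element 0 of
// the v2i32 bitcast) and Hi = the high 32 bits (element 1).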

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}
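
// For example, v3i32 splits into (v2i32, i32), v4f32 into (v2f32, v2f32),
// and v5f32 into (v4f32, f32).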

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
1898
1899// This is a shortcut for integer division because we have fast i32<->f32
1900// conversions, and fast f32 reciprocal instructions. The fractional part of a
1901// float is enough to accurately represent up to a 24-bit signed integer.
1902SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1903                                            bool Sign) const {
1904 SDLoc DL(Op);
1905 EVT VT = Op.getValueType();
1906 SDValue LHS = Op.getOperand(0);
1907 SDValue RHS = Op.getOperand(1);
1908 MVT IntVT = MVT::i32;
1909 MVT FltVT = MVT::f32;
1910
1911 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1912 if (LHSSignBits < 9)
1913 return SDValue();
1914
1915 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1916 if (RHSSignBits < 9)
1917 return SDValue();
1918
1919 unsigned BitSize = VT.getSizeInBits();
1920 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1921 unsigned DivBits = BitSize - SignBits;
1922 if (Sign)
1923 ++DivBits;
1924
1925 unsigned ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1926 unsigned ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1927
1928 SDValue jq = DAG.getConstant(1, DL, IntVT);
1929
1930 if (Sign) {
1931 // char|short jq = ia ^ ib;
1932 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1933
1934 // jq = jq >> (bitsize - 2)
1935 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1936 DAG.getConstant(BitSize - 2, DL, VT));
1937
1938 // jq = jq | 0x1
1939 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1940 }
1941
1942 // int ia = (int)LHS;
1943 SDValue ia = LHS;
1944
1945 // int ib = (int)RHS;
1946 SDValue ib = RHS;
1947
1948 // float fa = (float)ia;
1949 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1950
1951 // float fb = (float)ib;
1952 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1953
1954 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1955 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1956
1957 // fq = trunc(fq);
1958 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1959
1960 // float fqneg = -fq;
1961 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1962
1963 MachineFunction &MF = DAG.getMachineFunction();
1964
1965 bool UseFmadFtz = false;
1966 if (Subtarget->isGCN()) {
1967 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1968 UseFmadFtz =
1969     MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1970 }
1971
1972 // float fr = mad(fqneg, fb, fa);
1973 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1974 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1975 : (unsigned)ISD::FMAD;
1976 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1977
1978 // int iq = (int)fq;
1979 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1980
1981 // fr = fabs(fr);
1982 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1983
1984 // fb = fabs(fb);
1985 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1986
1987 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1988
1989 // int cv = fr >= fb;
1990 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1991
1992 // jq = (cv ? jq : 0);
1993 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1994
1995 // dst = iq + jq;
1996 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1997
1998 // Rem needs compensation; it's easier to recompute it.
1999 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2000 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2001
2002 // Truncate to number of bits this divide really is.
2003 if (Sign) {
2004 SDValue InRegSize
2005 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2006 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2007 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2008 } else {
2009 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2010 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2011 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2012 }
2013
2014 return DAG.getMergeValues({ Div, Rem }, DL);
2015}
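// --- Illustrative sketch (not part of the original source): a scalar C++
// model of the signed DIVREM24 path above, assuming both operands already
// passed the 9-sign-bit checks. An exact divide stands in for the hardware
// RCP estimate; the jq correction absorbs the rounding error either way.
#include <cmath>
#include <cstdint>
#include <utility>
static std::pair<int32_t, int32_t> divRem24Model(int32_t ia, int32_t ib) {
  int32_t jq = ((ia ^ ib) >> 30) | 1;          // +/-1, the sign of the quotient
  float fa = (float)ia;                        // exact: |ia| < 2^24
  float fb = (float)ib;
  float fq = std::trunc(fa * (1.0f / fb));     // fq = trunc(fa * rcp(fb))
  float fr = std::fabs(std::fma(-fq, fb, fa)); // fr = |mad(fqneg, fb, fa)|
  int32_t iq = (int32_t)fq;
  if (fr >= std::fabs(fb))                     // quotient estimate one step short
    iq += jq;
  return {iq, ia - iq * ib};                   // remainder is just recomputed
}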
2016
2017void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2018                                          SelectionDAG &DAG,
2019                                          SmallVectorImpl<SDValue> &Results) const {
2020 SDLoc DL(Op);
2021 EVT VT = Op.getValueType();
2022
2023 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2024
2025 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2026
2027 SDValue One = DAG.getConstant(1, DL, HalfVT);
2028 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2029
2030 //HiLo split
2031 SDValue LHS_Lo, LHS_Hi;
2032 SDValue LHS = Op.getOperand(0);
2033 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2034
2035 SDValue RHS_Lo, RHS_Hi;
2036 SDValue RHS = Op.getOperand(1);
2037 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2038
2039 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2040     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2041
2042 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2043 LHS_Lo, RHS_Lo);
2044
2045 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2046 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2047
2048 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2049 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2050 return;
2051 }
2052
2053 if (isTypeLegal(MVT::i64)) {
2054 // The algorithm here is based on ideas from "Software Integer Division",
2055 // Tom Rodeheffer, August 2008.
2056
2057 MachineFunction &MF = DAG.getMachineFunction();
2058 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2059
2060 // Compute denominator reciprocal.
2061 unsigned FMAD =
2062 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2063 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2064     ? (unsigned)ISD::FMAD
2065     : (unsigned)AMDGPUISD::FMAD_FTZ;
2066
2067 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2068 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2069 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2070 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2071 Cvt_Lo);
2072 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2073 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2074 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2075 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2076 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2077 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2078 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2079 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2080 Mul1);
2081 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2082 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2083 SDValue Rcp64 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2085
2086 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2087 SDValue One64 = DAG.getConstant(1, DL, VT);
2088 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2089 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2090
2091 // First round of UNR (Unsigned integer Newton-Raphson).
2092 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2093 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2094 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2095 SDValue Mulhi1_Lo, Mulhi1_Hi;
2096 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2097 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2098 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2099 Mulhi1_Lo, Zero1);
2100 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2101 Mulhi1_Hi, Add1_Lo.getValue(1));
2102 SDValue Add1 = DAG.getBitcast(VT,
2103 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2104
2105 // Second round of UNR.
2106 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2107 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2108 SDValue Mulhi2_Lo, Mulhi2_Hi;
2109 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2110 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2111 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2112 Mulhi2_Lo, Zero1);
2113 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2114 Mulhi2_Hi, Add2_Lo.getValue(1));
2115 SDValue Add2 = DAG.getBitcast(VT,
2116 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2117
2118 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2119
2120 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2121
2122 SDValue Mul3_Lo, Mul3_Hi;
2123 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2124 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2125 Mul3_Lo, Zero1);
2126 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2127 Mul3_Hi, Sub1_Lo.getValue(1));
2128 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2129 SDValue Sub1 = DAG.getBitcast(VT,
2130 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2131
2132 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2133 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2134 ISD::SETUGE);
2135 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2136 ISD::SETUGE);
2137 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2138
2139 // TODO: Here and below, portions of the code can be enclosed in if/endif.
2140 // Currently control flow is unconditional and we have 4 selects after
2141 // potential endif to substitute PHIs.
2142
2143 // if C3 != 0 ...
2144 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2145 RHS_Lo, Zero1);
2146 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2147 RHS_Hi, Sub1_Lo.getValue(1));
2148 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2149 Zero, Sub2_Lo.getValue(1));
2150 SDValue Sub2 = DAG.getBitcast(VT,
2151 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2152
2153 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2154
2155 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2156 ISD::SETUGE);
2157 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2158 ISD::SETUGE);
2159 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2160
2161 // if (C6 != 0)
2162 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2163
2164 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2165 RHS_Lo, Zero1);
2166 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2167 RHS_Hi, Sub2_Lo.getValue(1));
2168 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2169 Zero, Sub3_Lo.getValue(1));
2170 SDValue Sub3 = DAG.getBitcast(VT,
2171 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2172
2173 // endif C6
2174 // endif C3
2175
2176 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2177 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2178
2179 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2180 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2181
2182 Results.push_back(Div);
2183 Results.push_back(Rem);
2184
2185 return;
2186 }
2187
2188 // r600 expansion.
2189 // Get speculative values.
2190 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2191 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2192
2193 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2194 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2195 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2196
2197 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2198 SDValue DIV_Lo = Zero;
2199
2200 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2201
2202 for (unsigned i = 0; i < halfBitWidth; ++i) {
2203 const unsigned bitPos = halfBitWidth - i - 1;
2204 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2205 // Get value of high bit
2206 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2207 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2208 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2209
2210 // Shift
2211 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2212 // Add LHS high bit
2213 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2214
2215 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2216 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2217
2218 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2219
2220 // Update REM
2221 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2222 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2223 }
2224
2225 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2226 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2227 Results.push_back(DIV);
2228 Results.push_back(REM);
2229}
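// --- Illustrative sketch (not part of the original source): the r600
// restoring shift/subtract loop above, on scalar values.
#include <cstdint>
static void udivrem64Model(uint64_t LHS, uint64_t RHS,
                           uint64_t &Div, uint64_t &Rem) {
  uint32_t LHS_Lo = (uint32_t)LHS, LHS_Hi = (uint32_t)(LHS >> 32);
  uint32_t RHS_Lo = (uint32_t)RHS, RHS_Hi = (uint32_t)(RHS >> 32);
  // Speculative values: if RHS fits in 32 bits, divide the high word first.
  uint32_t DIV_Hi = RHS_Hi == 0 ? LHS_Hi / RHS_Lo : 0;
  Rem = RHS_Hi == 0 ? LHS_Hi % RHS_Lo : LHS_Hi;
  uint32_t DIV_Lo = 0;
  for (int bitPos = 31; bitPos >= 0; --bitPos) {
    Rem = (Rem << 1) | ((LHS_Lo >> bitPos) & 1); // shift in next dividend bit
    if (Rem >= RHS) {                            // the SETUGE select above
      DIV_Lo |= 1u << bitPos;
      Rem -= RHS;
    }
  }
  Div = ((uint64_t)DIV_Hi << 32) | DIV_Lo;
}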
2230
2231SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2232                                           SelectionDAG &DAG) const {
2233 SDLoc DL(Op);
2234 EVT VT = Op.getValueType();
2235
2236 if (VT == MVT::i64) {
2237 SmallVector<SDValue, 2> Results;
2238 LowerUDIVREM64(Op, DAG, Results);
2239 return DAG.getMergeValues(Results, DL);
2240 }
2241
2242 if (VT == MVT::i32) {
2243 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2244 return Res;
2245 }
2246
2247 SDValue X = Op.getOperand(0);
2248 SDValue Y = Op.getOperand(1);
2249
2250 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2251 // algorithm used here.
2252
2253 // Initial estimate of inv(y).
2254 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2255
2256 // One round of UNR.
2257 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2258 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2259 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2260 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2261
2262 // Quotient/remainder estimate.
2263 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2264 SDValue R =
2265 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2266
2267 // First quotient/remainder refinement.
2268 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2269 SDValue One = DAG.getConstant(1, DL, VT);
2270 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2271 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2272 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2273 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2274 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2275
2276 // Second quotient/remainder refinement.
2277 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2278 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2279 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2280 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2281 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2282
2283 return DAG.getMergeValues({Q, R}, DL);
2284}
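// --- Illustrative sketch (not part of the original source): the same
// estimate-and-refine scheme on scalars. The 32-bit divide below stands in
// for AMDGPUISD::URECIP (roughly 2^32 / y); the DAG code unrolls exactly two
// fix-up rounds, which suffice for the accuracy of the hardware estimate.
#include <cstdint>
static uint32_t mulhu32(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * b) >> 32);
}
static void udivrem32Model(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
  uint32_t z = 0xFFFFFFFFu / y;              // initial estimate of inv(y)
  z += mulhu32(z, (uint32_t)(0u - y) * z);   // one Newton-Raphson round
  q = mulhu32(x, z);                         // quotient estimate
  r = x - q * y;                             // remainder estimate
  while (r >= y) {                           // quotient/remainder refinement
    ++q;
    r -= y;
  }
}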
2285
2286SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2287                                           SelectionDAG &DAG) const {
2288 SDLoc DL(Op);
2289 EVT VT = Op.getValueType();
2290
2291 SDValue LHS = Op.getOperand(0);
2292 SDValue RHS = Op.getOperand(1);
2293
2294 SDValue Zero = DAG.getConstant(0, DL, VT);
2295 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2296
2297 if (VT == MVT::i32) {
2298 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2299 return Res;
2300 }
2301
2302 if (VT == MVT::i64 &&
2303 DAG.ComputeNumSignBits(LHS) > 32 &&
2304 DAG.ComputeNumSignBits(RHS) > 32) {
2305 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2306
2307 //HiLo split
2308 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2309 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2310 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2311 LHS_Lo, RHS_Lo);
2312 SDValue Res[2] = {
2313 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2314 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2315 };
2316 return DAG.getMergeValues(Res, DL);
2317 }
2318
2319 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2320 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2321 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2322 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2323
2324 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2325 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2326
2327 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2328 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2329
2330 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2331 SDValue Rem = Div.getValue(1);
2332
2333 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2334 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2335
2336 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2337 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2338
2339 SDValue Res[2] = {
2340 Div,
2341 Rem
2342 };
2343 return DAG.getMergeValues(Res, DL);
2344}
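// --- Illustrative sketch (not part of the original source): the sign
// handling above on scalars. The quotient takes sign(LHS)^sign(RHS); the
// remainder follows LHS. INT_MIN edge cases are ignored for brevity.
#include <cstdint>
static void sdivrem32Model(int32_t lhs, int32_t rhs,
                           int32_t &div, int32_t &rem) {
  int32_t lsign = lhs >> 31;                       // all ones iff lhs < 0
  int32_t rsign = rhs >> 31;
  int32_t dsign = lsign ^ rsign;
  uint32_t ua = (uint32_t)((lhs + lsign) ^ lsign); // abs: add, then xor
  uint32_t ub = (uint32_t)((rhs + rsign) ^ rsign);
  uint32_t uq = ua / ub, ur = ua % ub;             // the UDIVREM above
  div = ((int32_t)uq ^ dsign) - dsign;             // conditional negate
  rem = ((int32_t)ur ^ lsign) - lsign;
}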
2345
2346// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2347SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2348 SDLoc SL(Op);
2349 EVT VT = Op.getValueType();
2350 auto Flags = Op->getFlags();
2351 SDValue X = Op.getOperand(0);
2352 SDValue Y = Op.getOperand(1);
2353
2354 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2355 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2356 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2357 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2358 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2359}
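// --- Illustrative sketch (not part of the original source): the frem
// expansion above on a scalar float, with the same final fma.
#include <cmath>
static float fremModel(float x, float y) {
  float q = std::trunc(x / y);   // ftrunc (fdiv x, y)
  return std::fma(-q, y, x);     // fma (fneg q), y, x
}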
2360
2361SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2362 SDLoc SL(Op);
2363 SDValue Src = Op.getOperand(0);
2364
2365 // result = trunc(src)
2366 // if (src > 0.0 && src != result)
2367 // result += 1.0
2368
2369 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2370
2371 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2372 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2373
2374 EVT SetCCVT =
2375 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2376
2377 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2378 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2379 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2380
2381 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2382 // TODO: Should this propagate fast-math-flags?
2383 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2384}
2385
2386static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2387                                  SelectionDAG &DAG) {
2388 const unsigned FractBits = 52;
2389 const unsigned ExpBits = 11;
2390
2391 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2392 Hi,
2393 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2394 DAG.getConstant(ExpBits, SL, MVT::i32));
2395 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2396 DAG.getConstant(1023, SL, MVT::i32));
2397
2398 return Exp;
2399}
2400
2401SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2402 SDLoc SL(Op);
2403 SDValue Src = Op.getOperand(0);
2404
2405 assert(Op.getValueType() == MVT::f64);
2406
2407 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2408
2409 // Extract the upper half, since this is where we will find the sign and
2410 // exponent.
2411 SDValue Hi = getHiHalf64(Src, DAG);
2412
2413 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2414
2415 const unsigned FractBits = 52;
2416
2417 // Extract the sign bit.
2418 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2419 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2420
2421 // Extend back to 64-bits.
2422 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2423 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2424
2425 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2426 const SDValue FractMask
2427 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2428
2429 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2430 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2431 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2432
2433 EVT SetCCVT =
2434 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2435
2436 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2437
2438 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2439 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2440
2441 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2442 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2443
2444 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2445}
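// --- Illustrative sketch (not part of the original source): the same
// exponent-driven mask applied to a scalar f64 bit pattern.
#include <cstdint>
#include <cstring>
static double ftruncModel(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  int32_t exp = (int32_t)((bits >> 52) & 0x7ff) - 1023;
  if (exp < 0)
    bits &= 1ull << 63;                  // |x| < 1: keep only the sign bit
  else if (exp < 52)
    bits &= ~((1ull << (52 - exp)) - 1); // clear sub-integer fraction bits
  // exp >= 52 (including inf/nan): already integral, leave untouched.
  double r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}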
2446
2447SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2448                                              SelectionDAG &DAG) const {
2449 SDLoc SL(Op);
2450 SDValue Src = Op.getOperand(0);
2451
2452 assert(Op.getValueType() == MVT::f64);
2453
2454 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2455 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2456 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2457
2458 // TODO: Should this propagate fast-math-flags?
2459
2460 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2461 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2462
2463 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2464
2465 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2466 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2467
2468 EVT SetCCVT =
2469 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2470 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2471
2472 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2473}
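// --- Illustrative sketch (not part of the original source): the
// add-then-subtract trick above on a scalar, assuming strict FP evaluation
// (no fast-math reassociation). Adding copysign(2^52, x) forces the FPU to
// round x to an integer in the default nearest-even mode.
#include <cmath>
static double roundEvenModel(double x) {
  double c = std::copysign(0x1.0p+52, x);
  double r = (x + c) - c;  // rounds to nearest-even
  // Values with |x| > 0x1.fffffffffffffp+51 are already integral.
  return std::fabs(x) > 0x1.fffffffffffffp+51 ? x : r;
}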
2474
2475SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2476                                              SelectionDAG &DAG) const {
2477 // FNEARBYINT and FRINT are the same, except in their handling of FP
2478 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2479 // rint, so just treat them as equivalent.
2480 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2481 Op.getOperand(0));
2482}
2483
2484SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2485 auto VT = Op.getValueType();
2486 auto Arg = Op.getOperand(0u);
2487 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2488}
2489
2490// XXX - May require not supporting f32 denormals?
2491
2492// Don't handle v2f16. The extra instructions to scalarize and repack around the
2493// compare and vselect end up producing worse code than scalarizing the whole
2494// operation.
2495SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2496 SDLoc SL(Op);
2497 SDValue X = Op.getOperand(0);
2498 EVT VT = Op.getValueType();
2499
2500 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2501
2502 // TODO: Should this propagate fast-math-flags?
2503
2504 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2505
2506 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2507
2508 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2509 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2510
2511 EVT SetCCVT =
2512 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2513
2514 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2515 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2516 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2517
2518 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2519 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2520}
2521
2522SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2523 SDLoc SL(Op);
2524 SDValue Src = Op.getOperand(0);
2525
2526 // result = trunc(src);
2527 // if (src < 0.0 && src != result)
2528 // result += -1.0.
2529
2530 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2531
2532 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2533 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2534
2535 EVT SetCCVT =
2536 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2537
2538 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2539 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2540 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2541
2542 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2543 // TODO: Should this propagate fast-math-flags?
2544 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2545}
2546
2547/// Return true if it's known that \p Src can never be an f32 denormal value.
2548static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2549 switch (Src.getOpcode()) {
2550 case ISD::FP_EXTEND:
2551 return Src.getOperand(0).getValueType() == MVT::f16;
2552 case ISD::FP16_TO_FP:
2553 case ISD::FFREXP:
2554 return true;
2555 case ISD::INTRINSIC_WO_CHAIN: {
2556 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2557 switch (IntrinsicID) {
2558 case Intrinsic::amdgcn_frexp_mant:
2559 return true;
2560 default:
2561 return false;
2562 }
2563 }
2564 default:
2565 return false;
2566 }
2567
2568 llvm_unreachable("covered opcode switch");
2569}
2570
2571static bool allowApproxFunc(const SelectionDAG &DAG,
2572                            SDNodeFlags Flags) {
2573 if (Flags.hasApproximateFuncs())
2574 return true;
2575 auto &Options = DAG.getTarget().Options;
2576 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2577}
2578
2579static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2580                                   SDValue Src,
2581 SDNodeFlags Flags) {
2582 return !valueIsKnownNeverF32Denorm(Src) &&
2583 DAG.getMachineFunction()
2584              .getDenormalMode(APFloat::IEEEsingle())
2585              .Input != DenormalMode::PreserveSign;
2586}
2587
2588SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2589                                                    SDValue Src,
2590 SDNodeFlags Flags) const {
2591 SDLoc SL(Src);
2592 EVT VT = Src.getValueType();
2593 const fltSemantics &Semantics = VT.getFltSemantics();
2594 SDValue SmallestNormal =
2595 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2596
2597 // Want to scale denormals up, but negatives and 0 work just as well on the
2598 // scaled path.
2599 SDValue IsLtSmallestNormal = DAG.getSetCC(
2600 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2601 SmallestNormal, ISD::SETOLT);
2602
2603 return IsLtSmallestNormal;
2604}
2605
2606SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2607                                          SDNodeFlags Flags) const {
2608 SDLoc SL(Src);
2609 EVT VT = Src.getValueType();
2610 const fltSemantics &Semantics = VT.getFltSemantics();
2611 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2612
2613 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2614 SDValue IsFinite = DAG.getSetCC(
2615 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2616 Inf, ISD::SETOLT);
2617 return IsFinite;
2618}
2619
2620/// If denormal handling is required return the scaled input to FLOG2, and the
2621/// check for denormal range. Otherwise, return null values.
2622std::pair<SDValue, SDValue>
2623AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2624                                        SDValue Src, SDNodeFlags Flags) const {
2625 if (!needsDenormHandlingF32(DAG, Src, Flags))
2626 return {};
2627
2628 MVT VT = MVT::f32;
2629 const fltSemantics &Semantics = APFloat::IEEEsingle();
2630 SDValue SmallestNormal =
2631 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2632
2633 SDValue IsLtSmallestNormal = DAG.getSetCC(
2634 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2635 SmallestNormal, ISD::SETOLT);
2636
2637 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2638 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2639 SDValue ScaleFactor =
2640 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2641
2642 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2643 return {ScaledInput, IsLtSmallestNormal};
2644}
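// --- Illustrative sketch (not part of the original source): why the scaling
// is sound. For denormal x, log2(x) == log2(x * 2^32) - 32, and x * 2^32 is
// normal, so the hardware log (modeled by std::log2 here) can handle it.
#include <cmath>
static float log2WithDenormScaleModel(float x) {
  bool scaled = x < 0x1.0p-126f;            // below the smallest normal
  float in = scaled ? x * 0x1.0p+32f : x;   // scale denormals up
  float l = std::log2(in);                  // stands in for v_log_f32
  return scaled ? l - 32.0f : l;            // adjust the result back down
}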
2645
2646SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2647 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2648 // If we have to handle denormals, scale up the input and adjust the result.
2649
2650 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2651 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2652
2653 SDLoc SL(Op);
2654 EVT VT = Op.getValueType();
2655 SDValue Src = Op.getOperand(0);
2656 SDNodeFlags Flags = Op->getFlags();
2657
2658 if (VT == MVT::f16) {
2659 // Nothing in half is a denormal when promoted to f32.
2660 assert(!Subtarget->has16BitInsts());
2661 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2662 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2663 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2664 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2665 }
2666
2667 auto [ScaledInput, IsLtSmallestNormal] =
2668 getScaledLogInput(DAG, SL, Src, Flags);
2669 if (!ScaledInput)
2670 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2671
2672 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2673
2674 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2675 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2676 SDValue ResultOffset =
2677 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2678 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2679}
2680
2681static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2682 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2683 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2684 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2685}
2686
2687SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2688                                              SelectionDAG &DAG) const {
2689 SDValue X = Op.getOperand(0);
2690 EVT VT = Op.getValueType();
2691 SDNodeFlags Flags = Op->getFlags();
2692 SDLoc DL(Op);
2693
2694 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2695 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2696
2697 const auto &Options = getTargetMachine().Options;
2698 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2699 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2700
2701 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2702 // Log and multiply in f32 is good enough for f16.
2703 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2704 }
2705
2706 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2707 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2708 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2709 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2710 }
2711
2712 return Lowered;
2713 }
2714
2715 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2716 if (ScaledInput)
2717 X = ScaledInput;
2718
2719 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2720
2721 SDValue R;
2722 if (Subtarget->hasFastFMAF32()) {
2723 // c+cc are ln(2)/ln(10) to more than 49 bits
2724 const float c_log10 = 0x1.344134p-2f;
2725 const float cc_log10 = 0x1.09f79ep-26f;
2726
2727 // c + cc is ln(2) to more than 49 bits
2728 const float c_log = 0x1.62e42ep-1f;
2729 const float cc_log = 0x1.efa39ep-25f;
2730
2731 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2732 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2733
2734 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2735 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2736 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2737 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2738 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2739 } else {
2740 // ch+ct is ln(2)/ln(10) to more than 36 bits
2741 const float ch_log10 = 0x1.344000p-2f;
2742 const float ct_log10 = 0x1.3509f6p-18f;
2743
2744 // ch + ct is ln(2) to more than 36 bits
2745 const float ch_log = 0x1.62e000p-1f;
2746 const float ct_log = 0x1.0bfbe8p-15f;
2747
2748 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2749 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2750
2751 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2752 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2753 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2754 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2755 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2756
2757 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2758 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2759 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2760 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2761 }
2762
2763 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2764 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2765
2766 // TODO: Check if known finite from source value.
2767 if (!IsFiniteOnly) {
2768 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2769 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2770 }
2771
2772 if (IsScaled) {
2773 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2774 SDValue ShiftK =
2775 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2776 SDValue Shift =
2777 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2778 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2779 }
2780
2781 return R;
2782}
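// --- Illustrative sketch (not part of the original source): the FMA-based
// split-constant multiply above, shown for the natural-log case. ln(2) is
// carried as c + cc so y*ln(2) keeps roughly 49 bits of precision.
#include <cmath>
static float mulByLn2Model(float y) {
  const float c = 0x1.62e42ep-1f;    // high part of ln(2)
  const float cc = 0x1.efa39ep-25f;  // low part
  float r = y * c;
  float err = std::fma(y, c, -r);    // exact rounding error of y*c
  float lo = std::fma(y, cc, err);   // fold in the low-part product
  return r + lo;
}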
2783
2784SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2785 return LowerFLOGCommon(Op, DAG);
2786}
2787
2788// Do f32 fast-math expansion for flog2 or flog10. This is accurate enough for
2789// the promoted f16 operation.
2790SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2791                                              SelectionDAG &DAG, bool IsLog10,
2792 SDNodeFlags Flags) const {
2793 EVT VT = Src.getValueType();
2794 unsigned LogOp =
2795 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2796
2797 double Log2BaseInverted =
2798     IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2799
2800 if (VT == MVT::f32) {
2801 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2802 if (ScaledInput) {
2803 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2804 SDValue ScaledResultOffset =
2805 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2806
2807 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2808
2809 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2810 ScaledResultOffset, Zero, Flags);
2811
2812 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2813
2814 if (Subtarget->hasFastFMAF32())
2815 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2816 Flags);
2817 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2818 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2819 }
2820 }
2821
2822 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2823 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2824
2825 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2826 Flags);
2827}
2828
2829SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2830 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2831 // If we have to handle denormals, scale up the input and adjust the result.
2832
2833 SDLoc SL(Op);
2834 EVT VT = Op.getValueType();
2835 SDValue Src = Op.getOperand(0);
2836 SDNodeFlags Flags = Op->getFlags();
2837
2838 if (VT == MVT::f16) {
2839 // Nothing in half is a denormal when promoted to f32.
2840 assert(!Subtarget->has16BitInsts());
2841 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2842 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2843 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2844 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2845 }
2846
2847 assert(VT == MVT::f32);
2848
2849 if (!needsDenormHandlingF32(DAG, Src, Flags))
2850 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2851
2852 // bool needs_scaling = x < -0x1.f80000p+6f;
2853 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2854
2855 // -nextafter(128.0, -1)
2856 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2857
2858 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2859
2860 SDValue NeedsScaling =
2861 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2862
2863 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2864 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2865
2866 SDValue AddOffset =
2867 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2868
2869 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2870 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2871
2872 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2873 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2874 SDValue ResultScale =
2875 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2876
2877 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2878}
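// --- Illustrative sketch (not part of the original source): the range shift
// above. Where exp2(x) would be denormal, evaluate exp2(x + 64) * 2^-64 so
// the hardware input stays in the normal result range.
#include <cmath>
static float exp2WithDenormScaleModel(float x) {
  bool s = x < -0x1.f80000p+6f;            // -nextafter(128.0, -1)
  float e = std::exp2(s ? x + 64.0f : x);  // stands in for v_exp_f32
  return s ? e * 0x1.0p-64f : e;           // scale the result back down
}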
2879
2880SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2881                                              SelectionDAG &DAG,
2882 SDNodeFlags Flags) const {
2883 EVT VT = X.getValueType();
2884 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2885
2886 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2887 // exp2(M_LOG2E_F * f);
2888 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2889 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2890 : (unsigned)ISD::FEXP2,
2891 SL, VT, Mul, Flags);
2892 }
2893
2894 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2895
2896 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2897 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2898
2899 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2900
2901 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2902
2903 SDValue AdjustedX =
2904 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2905
2906 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2907
2908 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2909
2910 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2911 SDValue AdjustedResult =
2912 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2913
2914 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2915 Flags);
2916}
2917
2918/// Emit an approx-funcs-appropriate lowering for exp10. inf/nan should still
2919/// be handled correctly.
2920SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2921                                                SelectionDAG &DAG,
2922 SDNodeFlags Flags) const {
2923 const EVT VT = X.getValueType();
2924 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2925
2926 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2927 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2928 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2929 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2930
2931 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2932 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2933 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2934 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2935 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2936 }
2937
2938 // bool s = x < -0x1.2f7030p+5f;
2939 // x += s ? 0x1.0p+5f : 0.0f;
2940 // exp10 = exp2(x * 0x1.a92000p+1f) *
2941 // exp2(x * 0x1.4f0978p-11f) *
2942 // (s ? 0x1.9f623ep-107f : 1.0f);
2943
2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2945
2946 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2948
2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2950 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2951 SDValue AdjustedX =
2952 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2953
2954 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2955 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2956
2957 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2958 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2959 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2960 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2961
2962 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2963
2964 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2965 SDValue AdjustedResult =
2966 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2967
2968 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2969 Flags);
2970}
2971
2972SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2973 EVT VT = Op.getValueType();
2974 SDLoc SL(Op);
2975 SDValue X = Op.getOperand(0);
2976 SDNodeFlags Flags = Op->getFlags();
2977 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2978
2979 if (VT.getScalarType() == MVT::f16) {
2980 // v_exp_f16 (fmul x, log2e)
2981 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2982 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2983
2984 if (VT.isVector())
2985 return SDValue();
2986
2987 // exp(f16 x) ->
2988 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2989
2990 // Nothing in half is a denormal when promoted to f32.
2991 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2992 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2993 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2994 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2995 }
2996
2997 assert(VT == MVT::f32);
2998
2999 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3000 // library behavior. Also, is known-not-daz source sufficient?
3001 if (allowApproxFunc(DAG, Flags)) {
3002 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3003 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3004 }
3005
3006 // Algorithm:
3007 //
3008 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3009 //
3010 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3011 // n = 64*m + j, 0 <= j < 64
3012 //
3013 // e^x = 2^((64*m + j + f)/64)
3014 // = (2^m) * (2^(j/64)) * 2^(f/64)
3015 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3016 //
3017 // f = x*(64/ln(2)) - n
3018 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3019 //
3020 // e^x = (2^m) * (2^(j/64)) * e^r
3021 //
3022 // (2^(j/64)) is precomputed
3023 //
3024 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3025 // e^r = 1 + q
3026 //
3027 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3028 //
3029 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3030 SDNodeFlags FlagsNoContract = Flags;
3031 FlagsNoContract.setAllowContract(false);
3032
3033 SDValue PH, PL;
3034 if (Subtarget->hasFastFMAF32()) {
3035 const float c_exp = numbers::log2ef;
3036 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3037 const float c_exp10 = 0x1.a934f0p+1f;
3038 const float cc_exp10 = 0x1.2f346ep-24f;
3039
3040 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3041 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3042
3043 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3044 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3045 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3046 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3047 } else {
3048 const float ch_exp = 0x1.714000p+0f;
3049 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3050
3051 const float ch_exp10 = 0x1.a92000p+1f;
3052 const float cl_exp10 = 0x1.4f0978p-11f;
3053
3054 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3055 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3056
3057 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3058 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3059 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3060 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3061 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3062
3063 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3064
3065 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3066 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3067 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3068 }
3069
3070 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3071
3072 // It is unsafe to contract this fsub into the PH multiply.
3073 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3074
3075 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3076 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3077 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3078
3079 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3080
3081 SDValue UnderflowCheckConst =
3082 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3083
3084 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3085 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3086 SDValue Underflow =
3087 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3088
3089 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3090 const auto &Options = getTargetMachine().Options;
3091
3092 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3093 SDValue OverflowCheckConst =
3094 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3095 SDValue Overflow =
3096 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3097 SDValue Inf =
3098     DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3099 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3100 }
3101
3102 return R;
3103}
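// --- Illustrative sketch (not part of the original source): the overall
// flow above on scalars, using one log2(e) constant for brevity (the real
// code splits it into c + cc with FMAs for the extra precision). std::exp2
// stands in for the hardware v_exp_f32.
#include <cmath>
static float expModel(float x) {
  if (x < -0x1.9d1da0p+6f)           // underflow check: flush to zero
    return 0.0f;
  if (x > 0x1.62e430p+6f)            // overflow select from the code above
    return INFINITY;
  const float c = 0x1.715476p+0f;    // ~log2(e)
  float ph = x * c;
  float e = std::nearbyint(ph);      // FROUNDEVEN in the default mode
  float a = ph - e;                  // must not be contracted into ph
  return std::ldexp(std::exp2(a), (int)e);
}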
3104
3105static bool isCtlzOpc(unsigned Opc) {
3106 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3107}
3108
3109static bool isCttzOpc(unsigned Opc) {
3110 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3111}
3112
3113SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3114                                               SelectionDAG &DAG) const {
3115 auto SL = SDLoc(Op);
3116 auto Opc = Op.getOpcode();
3117 auto Arg = Op.getOperand(0u);
3118 auto ResultVT = Op.getValueType();
3119
3120 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3121 return {};
3122
3123 assert(isCtlzOpc(Opc));
3124 assert(ResultVT == Arg.getValueType());
3125
3126 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3127 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3128 SDValue NewOp;
3129
3130 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3131 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3132 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3133 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3134 } else {
3135 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3136 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3137 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3138 }
3139
3140 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3141}
3142
3143SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3144 SDLoc SL(Op);
3145 SDValue Src = Op.getOperand(0);
3146
3147 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3148 bool Ctlz = isCtlzOpc(Op.getOpcode());
3149 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3150
3151 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3152 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3153 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3154
3155 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3156 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3157 // (cttz hi:lo) -> (umin (ffbl src), 32)
3158 // (ctlz_zero_undef src) -> (ffbh src)
3159 // (cttz_zero_undef src) -> (ffbl src)
3160
3161 // The 64-bit scalar version produces a 32-bit result:
3162 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3163 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3164 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3165 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3166 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3167 if (!ZeroUndef) {
3168 const SDValue ConstVal = DAG.getConstant(
3169 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3170 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3171 }
3172 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3173 }
3174
3175 SDValue Lo, Hi;
3176 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3177
3178 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3179 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3180
3181 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3182 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3183 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3184 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3185
3186 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3187 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3188 if (Ctlz)
3189 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3190 else
3191 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3192
3193 SDValue NewOpr;
3194 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3195 if (!ZeroUndef) {
3196 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3197 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3198 }
3199
3200 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3201}
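// --- Illustrative sketch (not part of the original source): composing a
// 64-bit ctlz (with defined zero input) from two 32-bit ffbh results.
#include <cstdint>
static uint32_t ffbh32Model(uint32_t v) {   // FFBH_U32: ~0u when v == 0
  if (v == 0)
    return ~0u;
  uint32_t n = 0;
  while (!(v & 0x80000000u)) {
    v <<= 1;
    ++n;
  }
  return n;
}
static uint32_t ctlz64Model(uint32_t hi, uint32_t lo) {
  uint32_t h = ffbh32Model(hi);
  uint32_t l = ffbh32Model(lo);
  uint32_t lAdd = l > ~0u - 32 ? ~0u : l + 32;  // uaddsat (ffbh lo), 32
  uint32_t m = h < lAdd ? h : lAdd;             // umin
  return m < 64 ? m : 64;                       // final clamp: ctlz(0) == 64
}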
3202
3203SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3204                                               bool Signed) const {
3205 // The regular method of converting a 64-bit integer to a float roughly consists of
3206 // 2 steps: normalization and rounding. In fact, after normalization, the
3207 // conversion from a 64-bit integer to a float is essentially the same as the
3208 // one from a 32-bit integer. The only difference is that it has more
3209 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3210 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3211 // converted into the correct float number. The basic steps for the unsigned
3212 // conversion are illustrated in the following pseudo code:
3213 //
3214 // f32 uitofp(i64 u) {
3215 // i32 hi, lo = split(u);
3216 // // Only count the leading zeros in hi as we have native support of the
3217 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3218 // // reduced to a 32-bit one automatically.
3219 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3220 // u <<= shamt;
3221 // hi, lo = split(u);
3222 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3223 // // convert it as a 32-bit integer and scale the result back.
3224 // return uitofp(hi) * 2^(32 - shamt);
3225 // }
3226 //
3227 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3228 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3229 // converted instead, followed by negation based on its sign bit.
3230
3231 SDLoc SL(Op);
3232 SDValue Src = Op.getOperand(0);
3233
3234 SDValue Lo, Hi;
3235 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3236 SDValue Sign;
3237 SDValue ShAmt;
3238 if (Signed && Subtarget->isGCN()) {
3239 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3240 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3241 // account. That is, the maximal shift is
3242 // - 32 if Lo and Hi have opposite signs;
3243 // - 33 if Lo and Hi have the same sign.
3244 //
3245 // Or, MaxShAmt = 33 + OppositeSign, where
3246 //
3247 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3248 // - -1 if Lo and Hi have opposite signs; and
3249 // - 0 otherwise.
3250 //
3251 // All in all, ShAmt is calculated as
3252 //
3253 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3254 //
3255 // or
3256 //
3257 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3258 //
3259 // to reduce the critical path.
3260 SDValue OppositeSign = DAG.getNode(
3261 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3262 DAG.getConstant(31, SL, MVT::i32));
3263 SDValue MaxShAmt =
3264 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3265 OppositeSign);
3266 // Count the leading sign bits.
3267 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3268 // Different from unsigned conversion, the shift should be one bit less to
3269 // preserve the sign bit.
3270 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3271 DAG.getConstant(1, SL, MVT::i32));
3272 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3273 } else {
3274 if (Signed) {
3275 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3276 // absolute value first.
3277 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3278 DAG.getConstant(63, SL, MVT::i64));
3279 SDValue Abs =
3280 DAG.getNode(ISD::XOR, SL, MVT::i64,
3281 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3282 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3283 }
3284 // Count the leading zeros.
3285 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3286 // The shift amount for signed integers is [0, 32].
3287 }
3288 // Normalize the given 64-bit integer.
3289 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3290 // Split it again.
3291 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3292 // Calculate the adjust bit for rounding.
3293 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3294 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3295 DAG.getConstant(1, SL, MVT::i32), Lo);
3296 // Get the 32-bit normalized integer.
3297 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3298 // Convert the normalized 32-bit integer into f32.
3299 unsigned Opc =
3300 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3301 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3302
3303 // Finally, need to scale back the converted floating number as the original
3304 // 64-bit integer is converted as a 32-bit one.
3305 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3306 ShAmt);
3307 // On GCN, use LDEXP directly.
3308 if (Subtarget->isGCN())
3309 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3310
3311 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3312 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3313 // exponent is enough to avoid overflowing into the sign bit.
3314 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3315 DAG.getConstant(23, SL, MVT::i32));
3316 SDValue IVal =
3317 DAG.getNode(ISD::ADD, SL, MVT::i32,
3318 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3319 if (Signed) {
3320 // Set the sign bit.
3321 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3322 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3323 DAG.getConstant(31, SL, MVT::i32));
3324 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3325 }
3326 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3327}
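// --- Illustrative sketch (not part of the original source): the unsigned
// pseudo code from the comment above, made concrete on scalars.
#include <cmath>
#include <cstdint>
static float uitofp64To32Model(uint64_t u) {
  uint32_t hi = (uint32_t)(u >> 32);
  uint32_t shamt = 32;                       // clz(hi); 32 if hi is all 0s
  for (uint32_t i = 0; i < 32; ++i) {
    if (hi & (0x80000000u >> i)) {
      shamt = i;
      break;
    }
  }
  u <<= shamt;                               // normalize into the high word
  hi = (uint32_t)(u >> 32);
  uint32_t lo = (uint32_t)u;
  hi |= lo != 0 ? 1 : 0;                     // sticky bit keeps rounding exact
  return std::ldexp((float)hi, 32 - (int)shamt); // scale the result back
}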
3328
3329SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3330                                               bool Signed) const {
3331 SDLoc SL(Op);
3332 SDValue Src = Op.getOperand(0);
3333
3334 SDValue Lo, Hi;
3335 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3336
3337 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3338                             SL, MVT::f64, Hi);
3339
3340 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3341
3342 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3343 DAG.getConstant(32, SL, MVT::i32));
3344 // TODO: Should this propagate fast-math-flags?
3345 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3346}
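// --- Illustrative sketch (not part of the original source): the f64 case
// above; each 32-bit half converts exactly, so a single add suffices.
#include <cmath>
#include <cstdint>
static double uitofp64To64Model(uint64_t u) {
  double hi = (double)(uint32_t)(u >> 32);  // exact in a 52-bit mantissa
  double lo = (double)(uint32_t)u;          // exact
  return std::ldexp(hi, 32) + lo;           // one rounding, in the add
}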
3347
3348SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3349                                              SelectionDAG &DAG) const {
3350 // TODO: Factor out code common with LowerSINT_TO_FP.
3351 EVT DestVT = Op.getValueType();
3352 SDValue Src = Op.getOperand(0);
3353 EVT SrcVT = Src.getValueType();
3354
3355 if (SrcVT == MVT::i16) {
3356 if (DestVT == MVT::f16)
3357 return Op;
3358 SDLoc DL(Op);
3359
3360 // Promote src to i32
3361 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3362 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3363 }
3364
3365 if (DestVT == MVT::bf16) {
3366 SDLoc SL(Op);
3367 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3368 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3369 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3370 }
3371
3372 if (SrcVT != MVT::i64)
3373 return Op;
3374
3375 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3376 SDLoc DL(Op);
3377
3378 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3379 SDValue FPRoundFlag =
3380 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3381 SDValue FPRound =
3382 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3383
3384 return FPRound;
3385 }
3386
3387 if (DestVT == MVT::f32)
3388 return LowerINT_TO_FP32(Op, DAG, false);
3389
3390 assert(DestVT == MVT::f64);
3391 return LowerINT_TO_FP64(Op, DAG, false);
3392}
3393
3394SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3395                                              SelectionDAG &DAG) const {
3396 EVT DestVT = Op.getValueType();
3397
3398 SDValue Src = Op.getOperand(0);
3399 EVT SrcVT = Src.getValueType();
3400
3401 if (SrcVT == MVT::i16) {
3402 if (DestVT == MVT::f16)
3403 return Op;
3404
3405 SDLoc DL(Op);
3406 // Promote src to i32
3407 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3408 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3409 }
3410
3411 if (DestVT == MVT::bf16) {
3412 SDLoc SL(Op);
3413 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3414 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3415 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3416 }
3417
3418 if (SrcVT != MVT::i64)
3419 return Op;
3420
3421 // TODO: Factor out code common with LowerUINT_TO_FP.
3422
3423 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3424 SDLoc DL(Op);
3425 SDValue Src = Op.getOperand(0);
3426
3427 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3428 SDValue FPRoundFlag =
3429 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3430 SDValue FPRound =
3431 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3432
3433 return FPRound;
3434 }
3435
3436 if (DestVT == MVT::f32)
3437 return LowerINT_TO_FP32(Op, DAG, true);
3438
3439 assert(DestVT == MVT::f64);
3440 return LowerINT_TO_FP64(Op, DAG, true);
3441}
3442
3443SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3444 bool Signed) const {
3445 SDLoc SL(Op);
3446
3447 SDValue Src = Op.getOperand(0);
3448 EVT SrcVT = Src.getValueType();
3449
3450 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3451
3452 // The basic idea of converting a floating point number into a pair of 32-bit
3453 // integers is illustrated as follows:
3454 //
3455 // tf := trunc(val);
3456 // hif := floor(tf * 2^-32);
3457 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3458 // hi := fptoi(hif);
3459 // lo := fptoi(lof);
3460 //
3461 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3462 SDValue Sign;
3463 if (Signed && SrcVT == MVT::f32) {
3464 // However, a 32-bit floating point number has only a 23-bit mantissa and
3465 // that is not enough to hold all the significant bits of `lof` if val is
3466 // negative. To avoid the loss of precision, we need to take the absolute
3467 // value after truncating and flip the result back based on the original
3468 // signedness.
3469 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3470 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3471 DAG.getConstant(31, SL, MVT::i32));
3472 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3473 }
3474
3475 SDValue K0, K1;
3476 if (SrcVT == MVT::f64) {
3477 K0 = DAG.getConstantFP(
3478 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3479 SrcVT);
3480 K1 = DAG.getConstantFP(
3481 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3482 SrcVT);
3483 } else {
3484 K0 = DAG.getConstantFP(
3485 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3486 K1 = DAG.getConstantFP(
3487 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3488 }
3489 // TODO: Should this propagate fast-math-flags?
3490 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3491
3492 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3493
3494 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3495
3496 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3497 : ISD::FP_TO_UINT,
3498 SL, MVT::i32, FloorMul);
3499 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3500
3501 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3502 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3503
3504 if (Signed && SrcVT == MVT::f32) {
3505 assert(Sign);
3506 // Flip the result based on the signedness, which is either all 0s or 1s.
3507 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3508 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3509 // r := xor(r, sign) - sign;
3510 Result =
3511 DAG.getNode(ISD::SUB, SL, MVT::i64,
3512 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3513 }
3514
3515 return Result;
3516}
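The pseudocode comment above maps almost line for line onto scalar C++. A sketch of the unsigned f64 case, valid only for inputs whose truncation fits in 64 bits (the helper name is illustrative):

#include <cmath>
#include <cstdint>

static uint64_t f64ToU64(double Val) {
  double Tf  = std::trunc(Val);              // tf := trunc(val)
  double Hif = std::floor(Tf * 0x1p-32);     // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p32, Tf);   // lof := tf - hif * 2^32 (>= 0)
  uint64_t Hi = static_cast<uint32_t>(Hif);  // hi := fptoi(hif)
  uint64_t Lo = static_cast<uint32_t>(Lof);  // lo := fptoi(lof)
  return (Hi << 32) | Lo;
}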
3517
3518SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3519 SDLoc DL(Op);
3520 SDValue N0 = Op.getOperand(0);
3521
3522 // Convert to target node to get known bits
3523 if (N0.getValueType() == MVT::f32)
3524 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3525
3526 if (getTargetMachine().Options.UnsafeFPMath) {
3527 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3528 return SDValue();
3529 }
3530
3531 assert(N0.getSimpleValueType() == MVT::f64);
3532
3533 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3534 const unsigned ExpMask = 0x7ff;
3535 const unsigned ExpBiasf64 = 1023;
3536 const unsigned ExpBiasf16 = 15;
3537 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3538 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3539 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3540 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3541 DAG.getConstant(32, DL, MVT::i64));
3542 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3543 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3544 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3545 DAG.getConstant(20, DL, MVT::i64));
3546 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3547 DAG.getConstant(ExpMask, DL, MVT::i32));
3548 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3549 // add the f16 bias (15) to get the biased exponent for the f16 format.
3550 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3551 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3552
3553 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3554 DAG.getConstant(8, DL, MVT::i32));
3555 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3556 DAG.getConstant(0xffe, DL, MVT::i32));
3557
3558 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3559 DAG.getConstant(0x1ff, DL, MVT::i32));
3560 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3561
3562 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3563 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3564
3565 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3566 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3567 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3568 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3569
3570 // N = M | (E << 12);
3571 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3572 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3573 DAG.getConstant(12, DL, MVT::i32)));
3574
3575 // B = clamp(1-E, 0, 13);
3576 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3577 One, E);
3578 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3579 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3580 DAG.getConstant(13, DL, MVT::i32));
3581
3582 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3583 DAG.getConstant(0x1000, DL, MVT::i32));
3584
3585 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3586 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3587 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3588 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3589
3590 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3591 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3592 DAG.getConstant(0x7, DL, MVT::i32));
3593 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3594 DAG.getConstant(2, DL, MVT::i32));
3595 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3596 One, Zero, ISD::SETEQ);
3597 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3598 One, Zero, ISD::SETGT);
3599 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3600 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3601
3602 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3603 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3604 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3605 I, V, ISD::SETEQ);
3606
3607 // Extract the sign bit.
3608 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3609 DAG.getConstant(16, DL, MVT::i32));
3610 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3611 DAG.getConstant(0x8000, DL, MVT::i32));
3612
3613 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3614 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3615}
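Most of the work above is rounding and denormal handling; the exponent part is a plain rebias. A scalar sketch of just that step, assuming the IEEE f64 bit layout (the clamping to the valid f16 range happens in the selects above):

#include <cstdint>

static int rebiasedF16Exponent(uint64_t F64Bits) {
  int E = static_cast<int>((F64Bits >> 52) & 0x7ff); // 11-bit biased exponent
  return E - 1023 + 15; // remove f64 bias, apply f16 bias; may be out of range
}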
3616
3617SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3618 SelectionDAG &DAG) const {
3619 SDValue Src = Op.getOperand(0);
3620 unsigned OpOpcode = Op.getOpcode();
3621 EVT SrcVT = Src.getValueType();
3622 EVT DestVT = Op.getValueType();
3623
3624 // Will be selected natively
3625 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3626 return Op;
3627
3628 if (SrcVT == MVT::bf16) {
3629 SDLoc DL(Op);
3630 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3631 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3632 }
3633
3634 // Promote i16 to i32
3635 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3636 SDLoc DL(Op);
3637
3638 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3639 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3640 }
3641
3642 if (DestVT != MVT::i64)
3643 return Op;
3644
3645 if (SrcVT == MVT::f16 ||
3646 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3647 SDLoc DL(Op);
3648
3649 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3650 unsigned Ext =
3651 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3652 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3653 }
3654
3655 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3656 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3657
3658 return SDValue();
3659}
3660
3661SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3662 SelectionDAG &DAG) const {
3663 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3664 MVT VT = Op.getSimpleValueType();
3665 MVT ScalarVT = VT.getScalarType();
3666
3667 assert(VT.isVector());
3668
3669 SDValue Src = Op.getOperand(0);
3670 SDLoc DL(Op);
3671
3672 // TODO: Don't scalarize on Evergreen?
3673 unsigned NElts = VT.getVectorNumElements();
3674 SmallVector<SDValue, 8> Args;
3675 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3676
3677 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3678 for (unsigned I = 0; I < NElts; ++I)
3679 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3680
3681 return DAG.getBuildVector(VT, DL, Args);
3682}
3683
3684//===----------------------------------------------------------------------===//
3685// Custom DAG optimizations
3686//===----------------------------------------------------------------------===//
3687
3688static bool isU24(SDValue Op, SelectionDAG &DAG) {
3689 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3690}
3691
3692static bool isI24(SDValue Op, SelectionDAG &DAG) {
3693 EVT VT = Op.getValueType();
3694 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3695 // as unsigned 24-bit values.
3696 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3697}
3698
3699SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3700 DAGCombinerInfo &DCI) const {
3701 SelectionDAG &DAG = DCI.DAG;
3702 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3703 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3704
3705 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3706 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3707 unsigned NewOpcode = Node24->getOpcode();
3708 if (IsIntrin) {
3709 unsigned IID = Node24->getConstantOperandVal(0);
3710 switch (IID) {
3711 case Intrinsic::amdgcn_mul_i24:
3712 NewOpcode = AMDGPUISD::MUL_I24;
3713 break;
3714 case Intrinsic::amdgcn_mul_u24:
3715 NewOpcode = AMDGPUISD::MUL_U24;
3716 break;
3717 case Intrinsic::amdgcn_mulhi_i24:
3718 NewOpcode = AMDGPUISD::MULHI_I24;
3719 break;
3720 case Intrinsic::amdgcn_mulhi_u24:
3721 NewOpcode = AMDGPUISD::MULHI_U24;
3722 break;
3723 default:
3724 llvm_unreachable("Expected 24-bit mul intrinsic");
3725 }
3726 }
3727
3728 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3729
3730 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3731 // the operands to have other uses, but will only perform simplifications that
3732 // involve bypassing some nodes for this user.
3733 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3734 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3735 if (DemandedLHS || DemandedRHS)
3736 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3737 DemandedLHS ? DemandedLHS : LHS,
3738 DemandedRHS ? DemandedRHS : RHS);
3739
3740 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3741 // operands if this node is the only user.
3742 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3743 return SDValue(Node24, 0);
3744 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3745 return SDValue(Node24, 0);
3746
3747 return SDValue();
3748}
3749
3750template <typename IntTy>
3751static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3752 uint32_t Width, const SDLoc &DL) {
3753 if (Width + Offset < 32) {
3754 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3755 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3756 return DAG.getConstant(Result, DL, MVT::i32);
3757 }
3758
3759 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3760}
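A standalone restatement of the BFE semantics being constant-folded, with IntTy selecting sign or zero extension. It assumes arithmetic right shift for signed types, which C++20 guarantees and earlier implementations provide in practice:

#include <cstdint>

template <typename IntTy>
static IntTy bfeModel(IntTy Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    return static_cast<IntTy>(Shl) >> (32 - Width);
  }
  return Src0 >> Offset;
}
// bfeModel<int32_t>(0xf0, 4, 4) == -1; bfeModel<uint32_t>(0xf0, 4, 4) == 15.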
3761
3762static bool hasVolatileUser(SDNode *Val) {
3763 for (SDNode *U : Val->uses()) {
3764 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3765 if (M->isVolatile())
3766 return true;
3767 }
3768 }
3769
3770 return false;
3771}
3772
3773bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3774 // i32 vectors are the canonical memory type.
3775 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3776 return false;
3777
3778 if (!VT.isByteSized())
3779 return false;
3780
3781 unsigned Size = VT.getStoreSize();
3782
3783 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3784 return false;
3785
3786 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3787 return false;
3788
3789 return true;
3790}
3791
3792// Replace load of an illegal type with a load of a bitcast to a friendlier
3793// type.
3794SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3795 DAGCombinerInfo &DCI) const {
3796 if (!DCI.isBeforeLegalize())
3797 return SDValue();
3798
3799 LoadSDNode *LN = cast<LoadSDNode>(N);
3800 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3801 return SDValue();
3802
3803 SDLoc SL(N);
3804 SelectionDAG &DAG = DCI.DAG;
3805 EVT VT = LN->getMemoryVT();
3806
3807 unsigned Size = VT.getStoreSize();
3808 Align Alignment = LN->getAlign();
3809 if (Alignment < Size && isTypeLegal(VT)) {
3810 unsigned IsFast;
3811 unsigned AS = LN->getAddressSpace();
3812
3813 // Expand unaligned loads earlier than legalization. Due to visitation order
3814 // problems during legalization, the emitted instructions to pack and unpack
3815 // the bytes again are not eliminated in the case of an unaligned copy.
3816 if (!allowsMisalignedMemoryAccesses(
3817 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3818 if (VT.isVector())
3819 return SplitVectorLoad(SDValue(LN, 0), DAG);
3820
3821 SDValue Ops[2];
3822 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3823
3824 return DAG.getMergeValues(Ops, SDLoc(N));
3825 }
3826
3827 if (!IsFast)
3828 return SDValue();
3829 }
3830
3831 if (!shouldCombineMemoryType(VT))
3832 return SDValue();
3833
3834 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3835
3836 SDValue NewLoad
3837 = DAG.getLoad(NewVT, SL, LN->getChain(),
3838 LN->getBasePtr(), LN->getMemOperand());
3839
3840 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3841 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3842 return SDValue(N, 0);
3843}
3844
3845// Replace store of an illegal type with a store of a bitcast to a friendlier
3846// type.
3847SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3848 DAGCombinerInfo &DCI) const {
3849 if (!DCI.isBeforeLegalize())
3850 return SDValue();
3851
3852 StoreSDNode *SN = cast<StoreSDNode>(N);
3853 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3854 return SDValue();
3855
3856 EVT VT = SN->getMemoryVT();
3857 unsigned Size = VT.getStoreSize();
3858
3859 SDLoc SL(N);
3860 SelectionDAG &DAG = DCI.DAG;
3861 Align Alignment = SN->getAlign();
3862 if (Alignment < Size && isTypeLegal(VT)) {
3863 unsigned IsFast;
3864 unsigned AS = SN->getAddressSpace();
3865
3866 // Expand unaligned stores earlier than legalization. Due to visitation
3867 // order problems during legalization, the emitted instructions to pack and
3868 // unpack the bytes again are not eliminated in the case of an unaligned
3869 // copy.
3870 if (!allowsMisalignedMemoryAccesses(
3871 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3872 if (VT.isVector())
3873 return SplitVectorStore(SDValue(SN, 0), DAG);
3874
3875 return expandUnalignedStore(SN, DAG);
3876 }
3877
3878 if (!IsFast)
3879 return SDValue();
3880 }
3881
3882 if (!shouldCombineMemoryType(VT))
3883 return SDValue();
3884
3885 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3886 SDValue Val = SN->getValue();
3887
3888 //DCI.AddToWorklist(Val.getNode());
3889
3890 bool OtherUses = !Val.hasOneUse();
3891 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3892 if (OtherUses) {
3893 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3894 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3895 }
3896
3897 return DAG.getStore(SN->getChain(), SL, CastVal,
3898 SN->getBasePtr(), SN->getMemOperand());
3899}
3900
3901// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3902// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3903// issues.
3904SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3905 DAGCombinerInfo &DCI) const {
3906 SelectionDAG &DAG = DCI.DAG;
3907 SDValue N0 = N->getOperand(0);
3908
3909 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3910 // (vt2 (truncate (assertzext vt0:x, vt1)))
3911 if (N0.getOpcode() == ISD::TRUNCATE) {
3912 SDValue N1 = N->getOperand(1);
3913 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3914 SDLoc SL(N);
3915
3916 SDValue Src = N0.getOperand(0);
3917 EVT SrcVT = Src.getValueType();
3918 if (SrcVT.bitsGE(ExtVT)) {
3919 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3920 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3921 }
3922 }
3923
3924 return SDValue();
3925}
3926
3927SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3928 SDNode *N, DAGCombinerInfo &DCI) const {
3929 unsigned IID = N->getConstantOperandVal(0);
3930 switch (IID) {
3931 case Intrinsic::amdgcn_mul_i24:
3932 case Intrinsic::amdgcn_mul_u24:
3933 case Intrinsic::amdgcn_mulhi_i24:
3934 case Intrinsic::amdgcn_mulhi_u24:
3935 return simplifyMul24(N, DCI);
3936 case Intrinsic::amdgcn_fract:
3937 case Intrinsic::amdgcn_rsq:
3938 case Intrinsic::amdgcn_rcp_legacy:
3939 case Intrinsic::amdgcn_rsq_legacy:
3940 case Intrinsic::amdgcn_rsq_clamp: {
3941 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3942 SDValue Src = N->getOperand(1);
3943 return Src.isUndef() ? Src : SDValue();
3944 }
3945 case Intrinsic::amdgcn_frexp_exp: {
3946 // frexp_exp (fneg x) -> frexp_exp x
3947 // frexp_exp (fabs x) -> frexp_exp x
3948 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3949 SDValue Src = N->getOperand(1);
3950 SDValue PeekSign = peekFPSignOps(Src);
3951 if (PeekSign == Src)
3952 return SDValue();
3953 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3954 0);
3955 }
3956 default:
3957 return SDValue();
3958 }
3959}
3960
3961/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3962/// binary operation \p Opc to it with the corresponding constant operands.
3963SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3964 DAGCombinerInfo &DCI, const SDLoc &SL,
3965 unsigned Opc, SDValue LHS,
3966 uint32_t ValLo, uint32_t ValHi) const {
3967 SelectionDAG &DAG = DCI.DAG;
3968 SDValue Lo, Hi;
3969 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3970
3971 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3972 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3973
3974 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3975 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3976
3977 // Re-visit the ands. It's possible we eliminated one of them and it could
3978 // simplify the vector.
3979 DCI.AddToWorklist(Lo.getNode());
3980 DCI.AddToWorklist(Hi.getNode());
3981
3982 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3983 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3984}
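The split is sound because bitwise operations act on each bit independently, so the 64-bit op decomposes into two 32-bit ops on the halves. In scalar form, for the AND case (illustrative helper):

#include <cstdint>

static uint64_t and64ViaHalves(uint64_t LHS, uint32_t ValLo, uint32_t ValHi) {
  uint32_t Lo = static_cast<uint32_t>(LHS) & ValLo;
  uint32_t Hi = static_cast<uint32_t>(LHS >> 32) & ValHi;
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // build_vector + bitcast
}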
3985
3986SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3987 DAGCombinerInfo &DCI) const {
3988 EVT VT = N->getValueType(0);
3989
3990 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3991 if (!RHS)
3992 return SDValue();
3993
3994 SDValue LHS = N->getOperand(0);
3995 unsigned RHSVal = RHS->getZExtValue();
3996 if (!RHSVal)
3997 return LHS;
3998
3999 SDLoc SL(N);
4000 SelectionDAG &DAG = DCI.DAG;
4001
4002 switch (LHS->getOpcode()) {
4003 default:
4004 break;
4005 case ISD::ZERO_EXTEND:
4006 case ISD::SIGN_EXTEND:
4007 case ISD::ANY_EXTEND: {
4008 SDValue X = LHS->getOperand(0);
4009
4010 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4011 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4012 // Prefer build_vector as the canonical form if packed types are legal.
4013 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4014 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4015 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4016 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4017 }
4018
4019 // shl (ext x) => zext (shl x), if shift does not overflow int
4020 if (VT != MVT::i64)
4021 break;
4022 KnownBits Known = DAG.computeKnownBits(X);
4023 unsigned LZ = Known.countMinLeadingZeros();
4024 if (LZ < RHSVal)
4025 break;
4026 EVT XVT = X.getValueType();
4027 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4028 return DAG.getZExtOrTrunc(Shl, SL, VT);
4029 }
4030 }
4031
4032 if (VT != MVT::i64)
4033 return SDValue();
4034
4035 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4036
4037 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4038 // common case, splitting this into a move and a 32-bit shift is faster and
4039 // the same code size.
4040 if (RHSVal < 32)
4041 return SDValue();
4042
4043 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4044
4045 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4046 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4047
4048 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4049
4050 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4051 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4052}
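A scalar check of the C >= 32 rewrite: the low word of the result is zero and the high word is the low word of the input shifted left by C - 32 (illustrative helper):

#include <cstdint>

static uint64_t shl64Split(uint64_t X, unsigned C) { // assumes 32 <= C < 64
  uint32_t NewShift = static_cast<uint32_t>(X) << (C - 32);
  return static_cast<uint64_t>(NewShift) << 32; // build_pair 0, NewShift
}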
4053
4054SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4055 DAGCombinerInfo &DCI) const {
4056 if (N->getValueType(0) != MVT::i64)
4057 return SDValue();
4058
4059 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4060 if (!RHS)
4061 return SDValue();
4062
4063 SelectionDAG &DAG = DCI.DAG;
4064 SDLoc SL(N);
4065 unsigned RHSVal = RHS->getZExtValue();
4066
4067 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4068 if (RHSVal == 32) {
4069 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4070 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4071 DAG.getConstant(31, SL, MVT::i32));
4072
4073 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4074 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4075 }
4076
4077 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4078 if (RHSVal == 63) {
4079 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4080 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4081 DAG.getConstant(31, SL, MVT::i32));
4082 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4083 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4084 }
4085
4086 return SDValue();
4087}
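Both folds need only the high half of the input. A scalar model of the RHSVal == 32 case; the == 63 case instead replicates the sign mask into both words (assumes arithmetic shift on signed types):

#include <cstdint>

static int64_t sra64By32(int64_t X) {
  int32_t Hi = static_cast<int32_t>(X >> 32);
  uint32_t Sign = static_cast<uint32_t>(Hi >> 31); // all 0s or all 1s
  return static_cast<int64_t>((static_cast<uint64_t>(Sign) << 32) |
                              static_cast<uint32_t>(Hi));
}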
4088
4089SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4090 DAGCombinerInfo &DCI) const {
4091 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4092 if (!RHS)
4093 return SDValue();
4094
4095 EVT VT = N->getValueType(0);
4096 SDValue LHS = N->getOperand(0);
4097 unsigned ShiftAmt = RHS->getZExtValue();
4098 SelectionDAG &DAG = DCI.DAG;
4099 SDLoc SL(N);
4100
4101 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4102 // this improves the ability to match BFE patterns in isel.
4103 if (LHS.getOpcode() == ISD::AND) {
4104 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4105 unsigned MaskIdx, MaskLen;
4106 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4107 MaskIdx == ShiftAmt) {
4108 return DAG.getNode(
4109 ISD::AND, SL, VT,
4110 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4111 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4112 }
4113 }
4114 }
4115
4116 if (VT != MVT::i64)
4117 return SDValue();
4118
4119 if (ShiftAmt < 32)
4120 return SDValue();
4121
4122 // srl i64:x, C for C >= 32
4123 // =>
4124 // build_pair (srl hi_32(x), C - 32), 0
4125 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4126
4127 SDValue Hi = getHiHalf64(LHS, DAG);
4128
4129 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4130 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4131
4132 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4133
4134 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4135}
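And the unsigned counterpart: for C >= 32 the result is just the high word shifted right, with zeros above (illustrative helper):

#include <cstdint>

static uint64_t srl64Split(uint64_t X, unsigned C) { // assumes 32 <= C < 64
  uint32_t Hi = static_cast<uint32_t>(X >> 32);
  return Hi >> (C - 32); // build_pair (srl hi_32(x), C - 32), 0
}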
4136
4137SDValue AMDGPUTargetLowering::performTruncateCombine(
4138 SDNode *N, DAGCombinerInfo &DCI) const {
4139 SDLoc SL(N);
4140 SelectionDAG &DAG = DCI.DAG;
4141 EVT VT = N->getValueType(0);
4142 SDValue Src = N->getOperand(0);
4143
4144 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4145 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4146 SDValue Vec = Src.getOperand(0);
4147 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4148 SDValue Elt0 = Vec.getOperand(0);
4149 EVT EltVT = Elt0.getValueType();
4150 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4151 if (EltVT.isFloatingPoint()) {
4152 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4153 EltVT.changeTypeToInteger(), Elt0);
4154 }
4155
4156 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4157 }
4158 }
4159 }
4160
4161 // Equivalent of above for accessing the high element of a vector as an
4162 // integer operation.
4163 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4164 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4165 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4166 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4167 SDValue BV = stripBitcast(Src.getOperand(0));
4168 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4169 BV.getValueType().getVectorNumElements() == 2) {
4170 SDValue SrcElt = BV.getOperand(1);
4171 EVT SrcEltVT = SrcElt.getValueType();
4172 if (SrcEltVT.isFloatingPoint()) {
4173 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4174 SrcEltVT.changeTypeToInteger(), SrcElt);
4175 }
4176
4177 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4178 }
4179 }
4180 }
4181 }
4182
4183 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4184 //
4185 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4186 // i16 (trunc (srl (i32 (trunc x), K)))
4187 if (VT.getScalarSizeInBits() < 32) {
4188 EVT SrcVT = Src.getValueType();
4189 if (SrcVT.getScalarSizeInBits() > 32 &&
4190 (Src.getOpcode() == ISD::SRL ||
4191 Src.getOpcode() == ISD::SRA ||
4192 Src.getOpcode() == ISD::SHL)) {
4193 SDValue Amt = Src.getOperand(1);
4194 KnownBits Known = DAG.computeKnownBits(Amt);
4195
4196 // - For left shifts, do the transform as long as the shift
4197 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4198 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4199 // losing information stored in the high bits when truncating.
4200 const unsigned MaxCstSize =
4201 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4202 if (Known.getMaxValue().ule(MaxCstSize)) {
4203 EVT MidVT = VT.isVector() ?
4204 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4205 VT.getVectorNumElements()) : MVT::i32;
4206
4207 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4208 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4209 Src.getOperand(0));
4210 DCI.AddToWorklist(Trunc.getNode());
4211
4212 if (Amt.getValueType() != NewShiftVT) {
4213 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4214 DCI.AddToWorklist(Amt.getNode());
4215 }
4216
4217 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4218 Trunc, Amt);
4219 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4220 }
4221 }
4222 }
4223
4224 return SDValue();
4225}
4226
4227// We need to specifically handle i64 mul here to avoid unnecessary conversion
4228// instructions. If we only match on the legalized i64 mul expansion,
4229// SimplifyDemandedBits will be unable to remove them because there will be
4230// multiple uses due to the separate mul + mulh[su].
4231static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4232 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4233 if (Size <= 32) {
4234 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4235 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4236 }
4237
4238 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4239 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4240
4241 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4242 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4243
4244 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4245}
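The 64-bit path works because two 24-bit operands produce at most a 48-bit product, so the mul24/mulhi24 pair covers the whole result. A scalar model assuming both inputs are below 2^24:

#include <cstdint>

static uint64_t mul24Pair(uint32_t N0, uint32_t N1) {
  uint64_t Prod = static_cast<uint64_t>(N0) * N1;      // fits in 48 bits
  uint32_t MulLo = static_cast<uint32_t>(Prod);        // MUL_U24 result
  uint32_t MulHi = static_cast<uint32_t>(Prod >> 32);  // MULHI_U24 result
  return (static_cast<uint64_t>(MulHi) << 32) | MulLo; // BUILD_PAIR
}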
4246
4247/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4248/// return SDValue().
4249static SDValue getAddOneOp(const SDNode *V) {
4250 if (V->getOpcode() != ISD::ADD)
4251 return SDValue();
4252
4253 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4254}
4255
4256SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4257 DAGCombinerInfo &DCI) const {
4258 assert(N->getOpcode() == ISD::MUL);
4259 EVT VT = N->getValueType(0);
4260
4261 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4262 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4263 // unnecessarily). isDivergent() is used as an approximation of whether the
4264 // value is in an SGPR.
4265 if (!N->isDivergent())
4266 return SDValue();
4267
4268 unsigned Size = VT.getSizeInBits();
4269 if (VT.isVector() || Size > 64)
4270 return SDValue();
4271
4272 SelectionDAG &DAG = DCI.DAG;
4273 SDLoc DL(N);
4274
4275 SDValue N0 = N->getOperand(0);
4276 SDValue N1 = N->getOperand(1);
4277
4278 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4279 // matching.
4280
4281 // mul x, (add y, 1) -> add (mul x, y), x
4282 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4283 SDValue AddOp = getAddOneOp(V.getNode());
4284 if (!AddOp)
4285 return SDValue();
4286
4287 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4288 return U->getOpcode() == ISD::MUL;
4289 }))
4290 return AddOp;
4291
4292 return SDValue();
4293 };
4294
4295 // FIXME: The selection pattern is not properly checking for commuted
4296 // operands, so we have to place the mul in the LHS
4297 if (SDValue MulOper = IsFoldableAdd(N0)) {
4298 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4299 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4300 }
4301
4302 if (SDValue MulOper = IsFoldableAdd(N1)) {
4303 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4304 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4305 }
4306
4307 // There are i16 integer mul/mad.
4308 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4309 return SDValue();
4310
4311 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4312 // in the source into any_extends if the result of the mul is truncated. Since
4313 // we can assume the high bits are whatever we want, use the underlying value
4314 // to avoid the unknown high bits from interfering.
4315 if (N0.getOpcode() == ISD::ANY_EXTEND)
4316 N0 = N0.getOperand(0);
4317
4318 if (N1.getOpcode() == ISD::ANY_EXTEND)
4319 N1 = N1.getOperand(0);
4320
4321 SDValue Mul;
4322
4323 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4324 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4325 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4326 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4327 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4328 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4329 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4330 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4331 } else {
4332 return SDValue();
4333 }
4334
4335 // We need to use sext even for MUL_U24, because MUL_U24 is used
4336 // for signed multiply of 8 and 16-bit types.
4337 return DAG.getSExtOrTrunc(Mul, DL, VT);
4338}
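The add-of-one undo rests on plain distributivity, which holds even under wraparound, so the rewrite is always value-preserving:

#include <cstdint>

// x * (y + 1) == x * y + x in two's complement; rebuilding the mul + add
// form is what lets selection match a mad-style instruction.
static uint32_t mulAddOne(uint32_t X, uint32_t Y) { return X * Y + X; }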
4339
4340SDValue
4341AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4342 DAGCombinerInfo &DCI) const {
4343 if (N->getValueType(0) != MVT::i32)
4344 return SDValue();
4345
4346 SelectionDAG &DAG = DCI.DAG;
4347 SDLoc DL(N);
4348
4349 SDValue N0 = N->getOperand(0);
4350 SDValue N1 = N->getOperand(1);
4351
4352 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4353 // in the source into any_extends if the result of the mul is truncated. Since
4354 // we can assume the high bits are whatever we want, use the underlying value
4355 // to avoid the unknown high bits from interfering.
4356 if (N0.getOpcode() == ISD::ANY_EXTEND)
4357 N0 = N0.getOperand(0);
4358 if (N1.getOpcode() == ISD::ANY_EXTEND)
4359 N1 = N1.getOperand(0);
4360
4361 // Try to use two fast 24-bit multiplies (one for each half of the result)
4362 // instead of one slow extending multiply.
4363 unsigned LoOpcode, HiOpcode;
4364 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4365 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4366 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4367 LoOpcode = AMDGPUISD::MUL_U24;
4368 HiOpcode = AMDGPUISD::MULHI_U24;
4369 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4370 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4371 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4372 LoOpcode = AMDGPUISD::MUL_I24;
4373 HiOpcode = AMDGPUISD::MULHI_I24;
4374 } else {
4375 return SDValue();
4376 }
4377
4378 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4379 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4380 DCI.CombineTo(N, Lo, Hi);
4381 return SDValue(N, 0);
4382}
4383
4384SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4385 DAGCombinerInfo &DCI) const {
4386 EVT VT = N->getValueType(0);
4387
4388 if (!Subtarget->hasMulI24() || VT.isVector())
4389 return SDValue();
4390
4391 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4392 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4393 // unnecessarily). isDivergent() is used as an approximation of whether the
4394 // value is in an SGPR.
4395 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4396 // valu op anyway)
4397 if (Subtarget->hasSMulHi() && !N->isDivergent())
4398 return SDValue();
4399
4400 SelectionDAG &DAG = DCI.DAG;
4401 SDLoc DL(N);
4402
4403 SDValue N0 = N->getOperand(0);
4404 SDValue N1 = N->getOperand(1);
4405
4406 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4407 return SDValue();
4408
4409 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4410 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4411
4412 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4413 DCI.AddToWorklist(Mulhi.getNode());
4414 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4415}
4416
4417SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4418 DAGCombinerInfo &DCI) const {
4419 EVT VT = N->getValueType(0);
4420
4421 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4422 return SDValue();
4423
4424 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4425 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4426 // unnecessarily). isDivergent() is used as an approximation of whether the
4427 // value is in an SGPR.
4428 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4429 // valu op anyway)
4430 if (Subtarget->hasSMulHi() && !N->isDivergent())
4431 return SDValue();
4432
4433 SelectionDAG &DAG = DCI.DAG;
4434 SDLoc DL(N);
4435
4436 SDValue N0 = N->getOperand(0);
4437 SDValue N1 = N->getOperand(1);
4438
4439 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4440 return SDValue();
4441
4442 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4443 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4444
4445 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4446 DCI.AddToWorklist(Mulhi.getNode());
4447 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4448}
4449
4450SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4451 SDValue Op,
4452 const SDLoc &DL,
4453 unsigned Opc) const {
4454 EVT VT = Op.getValueType();
4455 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4456 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4457 LegalVT != MVT::i16))
4458 return SDValue();
4459
4460 if (VT != MVT::i32)
4461 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4462
4463 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4464 if (VT != MVT::i32)
4465 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4466
4467 return FFBX;
4468}
4469
4470// The native instructions return -1 on 0 input. Optimize out a select that
4471// produces -1 on 0.
4472//
4473// TODO: If zero is not undef, we could also do this if the output is compared
4474// against the bitwidth.
4475//
4476// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4477SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4478 SDValue LHS, SDValue RHS,
4479 DAGCombinerInfo &DCI) const {
4480 if (!isNullConstant(Cond.getOperand(1)))
4481 return SDValue();
4482
4483 SelectionDAG &DAG = DCI.DAG;
4484 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4485 SDValue CmpLHS = Cond.getOperand(0);
4486
4487 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4488 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4489 if (CCOpcode == ISD::SETEQ &&
4490 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4491 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4492 unsigned Opc =
4493 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4494 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4495 }
4496
4497 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4498 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4499 if (CCOpcode == ISD::SETNE &&
4500 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4501 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4502 unsigned Opc =
4503 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4504
4505 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4506 }
4507
4508 return SDValue();
4509}
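The fold is sound because the native find-first-bit instructions define the zero-input case as -1, making the guarding select redundant. A scalar model of the ffbh side using C++20 <bit> (illustrative, not the codegen path):

#include <bit>
#include <cstdint>

static uint32_t ffbhModel(uint32_t X) {
  return X == 0 ? ~0u : std::countl_zero(X); // native result is -1 on zero
}
// select(x == 0, -1, ctlz_zero_undef(x)) == ffbhModel(x) for every x.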
4510
4511static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4512 unsigned Op,
4513 const SDLoc &SL,
4514 SDValue Cond,
4515 SDValue N1,
4516 SDValue N2) {
4517 SelectionDAG &DAG = DCI.DAG;
4518 EVT VT = N1.getValueType();
4519
4520 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4521 N1.getOperand(0), N2.getOperand(0));
4522 DCI.AddToWorklist(NewSelect.getNode());
4523 return DAG.getNode(Op, SL, VT, NewSelect);
4524}
4525
4526// Pull a free FP operation out of a select so it may fold into uses.
4527//
4528// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4529// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4530//
4531// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4532// select c, (fabs x), +k -> fabs (select c, x, k)
4533SDValue
4534AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4535 SDValue N) const {
4536 SelectionDAG &DAG = DCI.DAG;
4537 SDValue Cond = N.getOperand(0);
4538 SDValue LHS = N.getOperand(1);
4539 SDValue RHS = N.getOperand(2);
4540
4541 EVT VT = N.getValueType();
4542 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4543 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4544 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4545 return SDValue();
4546
4547 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4548 SDLoc(N), Cond, LHS, RHS);
4549 }
4550
4551 bool Inv = false;
4552 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4553 std::swap(LHS, RHS);
4554 Inv = true;
4555 }
4556
4557 // TODO: Support vector constants.
4558 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4559 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4560 !selectSupportsSourceMods(N.getNode())) {
4561 SDLoc SL(N);
4562 // If one side is an fneg/fabs and the other is a constant, we can push the
4563 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4564 SDValue NewLHS = LHS.getOperand(0);
4565 SDValue NewRHS = RHS;
4566
4567 // Careful: if the neg can be folded up, don't try to pull it back down.
4568 bool ShouldFoldNeg = true;
4569
4570 if (NewLHS.hasOneUse()) {
4571 unsigned Opc = NewLHS.getOpcode();
4572 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4573 ShouldFoldNeg = false;
4574 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4575 ShouldFoldNeg = false;
4576 }
4577
4578 if (ShouldFoldNeg) {
4579 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4580 return SDValue();
4581
4582 // We're going to be forced to use a source modifier anyway, there's no
4583 // point to pulling the negate out unless we can get a size reduction by
4584 // negating the constant.
4585 //
4586 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4587 // about cheaper constants.
4588 if (NewLHS.getOpcode() == ISD::FABS &&
4589 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4590 return SDValue();
4591
4592 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4593 return SDValue();
4594
4595 if (LHS.getOpcode() == ISD::FNEG)
4596 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4597
4598 if (Inv)
4599 std::swap(NewLHS, NewRHS);
4600
4601 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4602 Cond, NewLHS, NewRHS);
4603 DCI.AddToWorklist(NewSelect.getNode());
4604 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4605 }
4606 }
4607
4608 return SDValue();
4609}
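The constant case is value-preserving because fneg is a pure sign-bit flip. A scalar check of select c, (fneg x), k -> fneg (select c, x, (fneg k)), which in practice is bitwise-identical on IEEE-754 platforms, including signed zeros:

#include <cstring>

static bool checkFNegSelectFold(bool C, float X, float K) {
  float Direct = C ? -X : K;    // select c, (fneg x), k
  float Folded = -(C ? X : -K); // fneg (select c, x, (fneg k))
  return std::memcmp(&Direct, &Folded, sizeof(float)) == 0;
}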
4610
4611SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4612 DAGCombinerInfo &DCI) const {
4613 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4614 return Folded;
4615
4616 SDValue Cond = N->getOperand(0);
4617 if (Cond.getOpcode() != ISD::SETCC)
4618 return SDValue();
4619
4620 EVT VT = N->getValueType(0);
4621 SDValue LHS = Cond.getOperand(0);
4622 SDValue RHS = Cond.getOperand(1);
4623 SDValue CC = Cond.getOperand(2);
4624
4625 SDValue True = N->getOperand(1);
4626 SDValue False = N->getOperand(2);
4627
4628 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4629 SelectionDAG &DAG = DCI.DAG;
4630 if (DAG.isConstantValueOfAnyType(True) &&
4631 !DAG.isConstantValueOfAnyType(False)) {
4632 // Swap cmp + select pair to move constant to false input.
4633 // This will allow using VOPC cndmasks more often.
4634 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4635
4636 SDLoc SL(N);
4637 ISD::CondCode NewCC =
4638 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4639
4640 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4641 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4642 }
4643
4644 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4645 SDValue MinMax
4646 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4647 // Revisit this node so we can catch min3/max3/med3 patterns.
4648 //DCI.AddToWorklist(MinMax.getNode());
4649 return MinMax;
4650 }
4651 }
4652
4653 // There's no reason to not do this if the condition has other uses.
4654 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4655}
4656
4657static bool isInv2Pi(const APFloat &APF) {
4658 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4659 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4660 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4661
4662 return APF.bitwiseIsEqual(KF16) ||
4663 APF.bitwiseIsEqual(KF32) ||
4664 APF.bitwiseIsEqual(KF64);
4665}
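The three bit patterns are 1/(2*pi) correctly rounded to half, single, and double precision. A quick scalar confirmation for the f32 pattern (not part of this file):

#include <cstdint>
#include <cstring>

static bool isInv2PiF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits == 0x3e22f983u; // 0.15915494f, i.e. 1/(2*pi) rounded to f32
}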
4666
4667// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4668// additional cost to negate them.
4669TargetLowering::NegatibleCost
4670AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4671 if (C->isZero())
4672 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4673
4674 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4675 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4676
4677 return NegatibleCost::Neutral;
4678}
4679
4680bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4681 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4682 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4683 return false;
4684}
4685
4686bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4687 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4688 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4689 return false;
4690}
4691
4692static unsigned inverseMinMax(unsigned Opc) {
4693 switch (Opc) {
4694 case ISD::FMAXNUM:
4695 return ISD::FMINNUM;
4696 case ISD::FMINNUM:
4697 return ISD::FMAXNUM;
4698 case ISD::FMAXNUM_IEEE:
4699 return ISD::FMINNUM_IEEE;
4700 case ISD::FMINNUM_IEEE:
4701 return ISD::FMAXNUM_IEEE;
4702 case ISD::FMAXIMUM:
4703 return ISD::FMINIMUM;
4704 case ISD::FMINIMUM:
4705 return ISD::FMAXIMUM;
4706 case AMDGPUISD::FMAX_LEGACY:
4707 return AMDGPUISD::FMIN_LEGACY;
4708 case AMDGPUISD::FMIN_LEGACY:
4709 return AMDGPUISD::FMAX_LEGACY;
4710 default:
4711 llvm_unreachable("invalid min/max opcode");
4712 }
4713}
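The table encodes the usual negation duality that the fneg combine below exploits: negating a max is the min of the negated operands, and vice versa. For ordered (non-NaN) inputs:

#include <algorithm>

// -max(x, y) == min(-x, -y); the separate IEEE/legacy opcodes exist because
// their NaN behavior differs, not because the identity changes.
static float negOfMax(float X, float Y) { return std::min(-X, -Y); }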
4714
4715/// \return true if it's profitable to try to push an fneg into its source
4716/// instruction.
4718 // If the input has multiple uses and we can either fold the negate down, or
4719 // the other uses cannot, give up. This both prevents unprofitable
4720 // transformations and infinite loops: we won't repeatedly try to fold around
4721 // a negate that has no 'good' form.
4722 if (N0.hasOneUse()) {
4723 // This may be able to fold into the source, but at a code size cost. Don't
4724 // fold if the fold into the user is free.
4725 if (allUsesHaveSourceMods(N, 0))
4726 return false;
4727 } else {
4728 if (fnegFoldsIntoOp(N0.getNode()) &&
4729 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4730 return false;
4731 }
4732
4733 return true;
4734}
4735
4736SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4737 DAGCombinerInfo &DCI) const {
4738 SelectionDAG &DAG = DCI.DAG;
4739 SDValue N0 = N->getOperand(0);
4740 EVT VT = N->getValueType(0);
4741
4742 unsigned Opc = N0.getOpcode();
4743
4744 if (!shouldFoldFNegIntoSrc(N, N0))
4745 return SDValue();
4746
4747 SDLoc SL(N);
4748 switch (Opc) {
4749 case ISD::FADD: {
4750 if (!mayIgnoreSignedZero(N0))
4751 return SDValue();
4752
4753 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4754 SDValue LHS = N0.getOperand(0);
4755 SDValue RHS = N0.getOperand(1);
4756
4757 if (LHS.getOpcode() != ISD::FNEG)
4758 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4759 else
4760 LHS = LHS.getOperand(0);
4761
4762 if (RHS.getOpcode() != ISD::FNEG)
4763 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4764 else
4765 RHS = RHS.getOperand(0);
4766
4767 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4768 if (Res.getOpcode() != ISD::FADD)
4769 return SDValue(); // Op got folded away.
4770 if (!N0.hasOneUse())
4771 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4772 return Res;
4773 }
4774 case ISD::FMUL:
4775 case AMDGPUISD::FMUL_LEGACY: {
4776 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4777 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4778 SDValue LHS = N0.getOperand(0);
4779 SDValue RHS = N0.getOperand(1);
4780
4781 if (LHS.getOpcode() == ISD::FNEG)
4782 LHS = LHS.getOperand(0);
4783 else if (RHS.getOpcode() == ISD::FNEG)
4784 RHS = RHS.getOperand(0);
4785 else
4786 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4787
4788 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4789 if (Res.getOpcode() != Opc)
4790 return SDValue(); // Op got folded away.
4791 if (!N0.hasOneUse())
4792 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4793 return Res;
4794 }
4795 case ISD::FMA:
4796 case ISD::FMAD: {
4797 // TODO: handle llvm.amdgcn.fma.legacy
4798 if (!mayIgnoreSignedZero(N0))
4799 return SDValue();
4800
4801 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4802 SDValue LHS = N0.getOperand(0);
4803 SDValue MHS = N0.getOperand(1);
4804 SDValue RHS = N0.getOperand(2);
4805
4806 if (LHS.getOpcode() == ISD::FNEG)
4807 LHS = LHS.getOperand(0);
4808 else if (MHS.getOpcode() == ISD::FNEG)
4809 MHS = MHS.getOperand(0);
4810 else
4811 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4812
4813 if (RHS.getOpcode() != ISD::FNEG)
4814 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4815 else
4816 RHS = RHS.getOperand(0);
4817
4818 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4819 if (Res.getOpcode() != Opc)
4820 return SDValue(); // Op got folded away.
4821 if (!N0.hasOneUse())
4822 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4823 return Res;
4824 }
4825 case ISD::FMAXNUM:
4826 case ISD::FMINNUM:
4827 case ISD::FMAXNUM_IEEE:
4828 case ISD::FMINNUM_IEEE:
4829 case ISD::FMINIMUM:
4830 case ISD::FMAXIMUM:
4831 case AMDGPUISD::FMAX_LEGACY:
4832 case AMDGPUISD::FMIN_LEGACY: {
4833 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4834 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4835 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4836 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4837
4838 SDValue LHS = N0.getOperand(0);
4839 SDValue RHS = N0.getOperand(1);
4840
4841 // 0 doesn't have a negated inline immediate.
4842 // TODO: This constant check should be generalized to other operations.
4843 if (isConstantCostlierToNegate(RHS))
4844 return SDValue();
4845
4846 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4847 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4848 unsigned Opposite = inverseMinMax(Opc);
4849
4850 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4851 if (Res.getOpcode() != Opposite)
4852 return SDValue(); // Op got folded away.
4853 if (!N0.hasOneUse())
4854 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4855 return Res;
4856 }
4857 case AMDGPUISD::FMED3: {
4858 SDValue Ops[3];
4859 for (unsigned I = 0; I < 3; ++I)
4860 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4861
4862 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4863 if (Res.getOpcode() != AMDGPUISD::FMED3)
4864 return SDValue(); // Op got folded away.
4865
4866 if (!N0.hasOneUse()) {
4867 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4868 DAG.ReplaceAllUsesWith(N0, Neg);
4869
4870 for (SDNode *U : Neg->uses())
4871 DCI.AddToWorklist(U);
4872 }
4873
4874 return Res;
4875 }
4876 case ISD::FP_EXTEND:
4877 case ISD::FTRUNC:
4878 case ISD::FRINT:
4879 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4880 case ISD::FROUNDEVEN:
4881 case ISD::FSIN:
4882 case ISD::FCANONICALIZE:
4883 case AMDGPUISD::RCP:
4884 case AMDGPUISD::RCP_LEGACY:
4885 case AMDGPUISD::RCP_IFLAG:
4886 case AMDGPUISD::SIN_HW: {
4887 SDValue CvtSrc = N0.getOperand(0);
4888 if (CvtSrc.getOpcode() == ISD::FNEG) {
4889 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4890 // (fneg (rcp (fneg x))) -> (rcp x)
4891 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4892 }
4893
4894 if (!N0.hasOneUse())
4895 return SDValue();
4896
4897 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4898 // (fneg (rcp x)) -> (rcp (fneg x))
4899 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4900 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4901 }
4902 case ISD::FP_ROUND: {
4903 SDValue CvtSrc = N0.getOperand(0);
4904
4905 if (CvtSrc.getOpcode() == ISD::FNEG) {
4906 // (fneg (fp_round (fneg x))) -> (fp_round x)
4907 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4908 CvtSrc.getOperand(0), N0.getOperand(1));
4909 }
4910
4911 if (!N0.hasOneUse())
4912 return SDValue();
4913
4914 // (fneg (fp_round x)) -> (fp_round (fneg x))
4915 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4916 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4917 }
4918 case ISD::FP16_TO_FP: {
4919 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4920 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4921 // Put the fneg back as a legal source operation that can be matched later.
4922 SDLoc SL(N);
4923
4924 SDValue Src = N0.getOperand(0);
4925 EVT SrcVT = Src.getValueType();
4926
4927 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4928 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4929 DAG.getConstant(0x8000, SL, SrcVT));
4930 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4931 }
4932 case ISD::SELECT: {
4933 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4934 // TODO: Invert conditions of foldFreeOpFromSelect
4935 return SDValue();
4936 }
4937 case ISD::BITCAST: {
4938 SDLoc SL(N);
4939 SDValue BCSrc = N0.getOperand(0);
4940 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4941 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4942 if (HighBits.getValueType().getSizeInBits() != 32 ||
4943 !fnegFoldsIntoOp(HighBits.getNode()))
4944 return SDValue();
4945
4946 // f64 fneg only really needs to operate on the high half of the
4947 // register, so try to force it to an f32 operation to help make use of
4948 // source modifiers.
4949 //
4950 //
4951 // fneg (f64 (bitcast (build_vector x, y))) ->
4952 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4953 // (fneg (bitcast i32:y to f32)))
4954
4955 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4956 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4957 SDValue CastBack =
4958 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4959
4960 SmallVector<SDValue, 8> Ops(BCSrc->ops());
4961 Ops.back() = CastBack;
4962 DCI.AddToWorklist(NegHi.getNode());
4963 SDValue Build =
4964 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4965 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4966
4967 if (!N0.hasOneUse())
4968 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4969 return Result;
4970 }
4971
4972 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4973 BCSrc.hasOneUse()) {
4974 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4975 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4976
4977 // TODO: Cast back result for multiple uses is beneficial in some cases.
4978
4979 SDValue LHS =
4980 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4981 SDValue RHS =
4982 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4983
4984 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4985 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4986
4987 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4988 NegRHS);
4989 }
4990
4991 return SDValue();
4992 }
4993 default:
4994 return SDValue();
4995 }
4996}
4997
4998SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4999 DAGCombinerInfo &DCI) const {
5000 SelectionDAG &DAG = DCI.DAG;
5001 SDValue N0 = N->getOperand(0);
5002
5003 if (!N0.hasOneUse())
5004 return SDValue();
5005
5006 switch (N0.getOpcode()) {
5007 case ISD::FP16_TO_FP: {
5008 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5009 SDLoc SL(N);
5010 SDValue Src = N0.getOperand(0);
5011 EVT SrcVT = Src.getValueType();
5012
5013 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5014 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5015 DAG.getConstant(0x7fff, SL, SrcVT));
5016 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5017 }
5018 default:
5019 return SDValue();
5020 }
5021}
5022
5023SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5024 DAGCombinerInfo &DCI) const {
5025 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5026 if (!CFP)
5027 return SDValue();
5028
5029 // XXX - Should this flush denormals?
5030 const APFloat &Val = CFP->getValueAPF();
5031 APFloat One(Val.getSemantics(), "1.0");
5032 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5033}
5034
5035SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5036 DAGCombinerInfo &DCI) const {
5037 SelectionDAG &DAG = DCI.DAG;
5038 SDLoc DL(N);
5039
5040 switch(N->getOpcode()) {
5041 default:
5042 break;
5043 case ISD::BITCAST: {
5044 EVT DestVT = N->getValueType(0);
5045
5046 // Push casts through vector builds. This helps avoid emitting a large
5047 // number of copies when materializing floating point vector constants.
5048 //
5049 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5050 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5051 if (DestVT.isVector()) {
5052 SDValue Src = N->getOperand(0);
5053 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5054 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5055 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5056 EVT SrcVT = Src.getValueType();
5057 unsigned NElts = DestVT.getVectorNumElements();
5058
5059 if (SrcVT.getVectorNumElements() == NElts) {
5060 EVT DestEltVT = DestVT.getVectorElementType();
5061
5062 SmallVector<SDValue, 8> CastedElts;
5063 SDLoc SL(N);
5064 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5065 SDValue Elt = Src.getOperand(I);
5066 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5067 }
5068
5069 return DAG.getBuildVector(DestVT, SL, CastedElts);
5070 }
5071 }
5072 }
5073
5074 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5075 break;
5076
5077 // Fold bitcasts of constants.
5078 //
5079 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5080 // TODO: Generalize and move to DAGCombiner
5081 SDValue Src = N->getOperand(0);
5082 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5083 SDLoc SL(N);
5084 uint64_t CVal = C->getZExtValue();
5085 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5086 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5087 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5088 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5089 }
5090
5091 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5092 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5093 SDLoc SL(N);
5094 uint64_t CVal = Val.getZExtValue();
5095 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5096 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5097 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5098
5099 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5100 }
5101
5102 break;
5103 }
5104 case ISD::SHL: {
5105 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5106 break;
5107
5108 return performShlCombine(N, DCI);
5109 }
5110 case ISD::SRL: {
5111 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5112 break;
5113
5114 return performSrlCombine(N, DCI);
5115 }
5116 case ISD::SRA: {
5117 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5118 break;
5119
5120 return performSraCombine(N, DCI);
5121 }
5122 case ISD::TRUNCATE:
5123 return performTruncateCombine(N, DCI);
5124 case ISD::MUL:
5125 return performMulCombine(N, DCI);
5126 case AMDGPUISD::MUL_U24:
5127 case AMDGPUISD::MUL_I24: {
5128 if (SDValue Simplified = simplifyMul24(N, DCI))
5129 return Simplified;
5130 break;
5131 }
5132 case AMDGPUISD::MULHI_I24:
5133 case AMDGPUISD::MULHI_U24:
5134 return simplifyMul24(N, DCI);
5135 case ISD::SMUL_LOHI:
5136 case ISD::UMUL_LOHI:
5137 return performMulLoHiCombine(N, DCI);
5138 case ISD::MULHS:
5139 return performMulhsCombine(N, DCI);
5140 case ISD::MULHU:
5141 return performMulhuCombine(N, DCI);
5142 case ISD::SELECT:
5143 return performSelectCombine(N, DCI);
5144 case ISD::FNEG:
5145 return performFNegCombine(N, DCI);
5146 case ISD::FABS:
5147 return performFAbsCombine(N, DCI);
5148 case AMDGPUISD::BFE_I32:
5149 case AMDGPUISD::BFE_U32: {
5150 assert(!N->getValueType(0).isVector() &&
5151 "Vector handling of BFE not implemented");
5152 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5153 if (!Width)
5154 break;
5155
5156 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5157 if (WidthVal == 0)
5158 return DAG.getConstant(0, DL, MVT::i32);
5159
5160 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5161 if (!Offset)
5162 break;
5163
5164 SDValue BitsFrom = N->getOperand(0);
5165 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5166
5167 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5168
5169 if (OffsetVal == 0) {
5170 // This is already sign / zero extended, so try to fold away extra BFEs.
5171 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
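      // BFE_I32 sign-extends from bit W-1, giving 32 - W + 1 known sign
      // bits; BFE_U32 zero-fills the top 32 - W bits.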
5172
5173 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5174 if (OpSignBits >= SignBits)
5175 return BitsFrom;
5176
5177 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5178 if (Signed) {
5179 // This is a sign_extend_inreg. Replace it to take advantage of existing
5180 // DAG Combines. If not eliminated, we will match back to BFE during
5181 // selection.
5182
5183         // TODO: The sext_inreg of extended types ends up split, although we
5184         // could handle them in a single BFE.
5185 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5186 DAG.getValueType(SmallVT));
5187 }
5188
5189 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5190 }
5191
5192 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5193 if (Signed) {
5194 return constantFoldBFE<int32_t>(DAG,
5195 CVal->getSExtValue(),
5196 OffsetVal,
5197 WidthVal,
5198 DL);
5199 }
5200
5201 return constantFoldBFE<uint32_t>(DAG,
5202 CVal->getZExtValue(),
5203 OffsetVal,
5204 WidthVal,
5205 DL);
5206 }
5207
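    // When the field reaches bit 31 the BFE degenerates to a plain shift;
    // e.g. bfe_i32 x, 24, 8 extracts the top byte, which is just sra x, 24.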
5208 if ((OffsetVal + WidthVal) >= 32 &&
5209 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5210 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5211 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5212 BitsFrom, ShiftVal);
5213 }
5214
5215 if (BitsFrom.hasOneUse()) {
5216 APInt Demanded = APInt::getBitsSet(32,
5217 OffsetVal,
5218 OffsetVal + WidthVal);
5219
5220 KnownBits Known;
5221       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5222                                             !DCI.isBeforeLegalizeOps());
5223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5224 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5225 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5226 DCI.CommitTargetLoweringOpt(TLO);
5227 }
5228 }
5229
5230 break;
5231 }
5232 case ISD::LOAD:
5233 return performLoadCombine(N, DCI);
5234 case ISD::STORE:
5235 return performStoreCombine(N, DCI);
5236 case AMDGPUISD::RCP:
5237   case AMDGPUISD::RCP_IFLAG:
5238     return performRcpCombine(N, DCI);
5239 case ISD::AssertZext:
5240 case ISD::AssertSext:
5241 return performAssertSZExtCombine(N, DCI);
5242   case ISD::INTRINSIC_WO_CHAIN:
5243     return performIntrinsicWOChainCombine(N, DCI);
5244 case AMDGPUISD::FMAD_FTZ: {
5245 SDValue N0 = N->getOperand(0);
5246 SDValue N1 = N->getOperand(1);
5247 SDValue N2 = N->getOperand(2);
5248 EVT VT = N->getValueType(0);
5249
5250 // FMAD_FTZ is a FMAD + flush denormals to zero.
5251 // We flush the inputs, the intermediate step, and the output.
5252 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5253 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5254 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5255 if (N0CFP && N1CFP && N2CFP) {
5256 const auto FTZ = [](const APFloat &V) {
5257 if (V.isDenormal()) {
5258 APFloat Zero(V.getSemantics(), 0);
5259 return V.isNegative() ? -Zero : Zero;
5260 }
5261 return V;
5262 };
5263
5264 APFloat V0 = FTZ(N0CFP->getValueAPF());
5265 APFloat V1 = FTZ(N1CFP->getValueAPF());
5266       APFloat V2 = FTZ(N2CFP->getValueAPF());
5267       V0.multiply(V1, APFloat::rmNearestTiesToEven);
5268       V0 = FTZ(V0);
5269       V0.add(V2, APFloat::rmNearestTiesToEven);
5270       return DAG.getConstantFP(FTZ(V0), DL, VT);
5271 }
5272 break;
5273 }
5274 }
5275 return SDValue();
5276}
5277
5278//===----------------------------------------------------------------------===//
5279// Helper functions
5280//===----------------------------------------------------------------------===//
5281
5282 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5283                                                    const TargetRegisterClass *RC,
5284 Register Reg, EVT VT,
5285 const SDLoc &SL,
5286 bool RawReg) const {
5287   MachineFunction &MF = DAG.getMachineFunction();
5288   MachineRegisterInfo &MRI = MF.getRegInfo();
5289   Register VReg;
5290
5291 if (!MRI.isLiveIn(Reg)) {
5292 VReg = MRI.createVirtualRegister(RC);
5293 MRI.addLiveIn(Reg, VReg);
5294 } else {
5295 VReg = MRI.getLiveInVirtReg(Reg);
5296 }
5297
5298 if (RawReg)
5299 return DAG.getRegister(VReg, VT);
5300
5301 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5302}
5303
5304// This may be called multiple times, and nothing prevents creating multiple
5305// objects at the same offset. See if we already defined this object.
5306 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5307                                        int64_t Offset) {
5308 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5309 if (MFI.getObjectOffset(I) == Offset) {
5310 assert(MFI.getObjectSize(I) == Size);
5311 return I;
5312 }
5313 }
5314
5315 return MFI.CreateFixedObject(Size, Offset, true);
5316}
5317
5318 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5319                                                   EVT VT,
5320 const SDLoc &SL,
5321 int64_t Offset) const {
5322   MachineFunction &MF = DAG.getMachineFunction();
5323   MachineFrameInfo &MFI = MF.getFrameInfo();
5324 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5325
5326 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5327 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5328
5329   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5330                      MachineMemOperand::MODereferenceable |
5331                          MachineMemOperand::MOInvariant);
5332}
5333
5334 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5335                                                    const SDLoc &SL,
5336 SDValue Chain,
5337 SDValue ArgVal,
5338 int64_t Offset) const {
5339   MachineFunction &MF = DAG.getMachineFunction();
5340   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5341   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5342 
5343 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5344 // Stores to the argument stack area are relative to the stack pointer.
5345 SDValue SP =
5346 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5347 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5348   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5349                                MachineMemOperand::MODereferenceable);
5350   return Store;
5351}
5352
5353 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5354                                              const TargetRegisterClass *RC,
5355 EVT VT, const SDLoc &SL,
5356 const ArgDescriptor &Arg) const {
5357 assert(Arg && "Attempting to load missing argument");
5358
5359 SDValue V = Arg.isRegister() ?
5360 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5361 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5362
5363 if (!Arg.isMasked())
5364 return V;
5365
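  // The value is packed with others in a single register; shift the field
  // down to bit 0 and mask it off. E.g. a mask of 0x3ff0 gives Shift = 4,
  // so the result is (V >> 4) & 0x3ff.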
5366 unsigned Mask = Arg.getMask();
5367 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5368 V = DAG.getNode(ISD::SRL, SL, VT, V,
5369 DAG.getShiftAmountConstant(Shift, VT, SL));
5370 return DAG.getNode(ISD::AND, SL, VT, V,
5371 DAG.getConstant(Mask >> Shift, SL, VT));
5372}
5373
5374 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5375     uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5376 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5377 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5378 uint64_t ArgOffset =
5379 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
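  // The implicit block begins at the end of the explicit arguments, rounded
  // up to the implicit-arg alignment; e.g. 36 bytes of explicit arguments
  // with 8-byte alignment start the implicit block at byte 40 (plus
  // ExplicitArgOffset).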
5380 switch (Param) {
5381 case FIRST_IMPLICIT:
5382 return ArgOffset;
5383 case PRIVATE_BASE:
5384     return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5385   case SHARED_BASE:
5386 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5387 case QUEUE_PTR:
5388 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5389 }
5390 llvm_unreachable("unexpected implicit parameter type");
5391}
5392
5393 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5394     const MachineFunction &MF, const ImplicitParameter Param) const {
5395   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5396   return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5397 }
5398
5399#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5400
5401const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5402 switch ((AMDGPUISD::NodeType)Opcode) {
5403 case AMDGPUISD::FIRST_NUMBER: break;
5404 // AMDIL DAG nodes
5405 NODE_NAME_CASE(UMUL);
5406 NODE_NAME_CASE(BRANCH_COND);
5407
5408 // AMDGPU DAG nodes
5409 NODE_NAME_CASE(IF)
5410 NODE_NAME_CASE(ELSE)
5411 NODE_NAME_CASE(LOOP)
5412 NODE_NAME_CASE(CALL)
5413 NODE_NAME_CASE(TC_RETURN)
5414 NODE_NAME_CASE(TC_RETURN_GFX)
5415 NODE_NAME_CASE(TC_RETURN_CHAIN)
5416 NODE_NAME_CASE(TRAP)
5417 NODE_NAME_CASE(RET_GLUE)
5418 NODE_NAME_CASE(WAVE_ADDRESS)
5419 NODE_NAME_CASE(RETURN_TO_EPILOG)
5420 NODE_NAME_CASE(ENDPGM)
5421 NODE_NAME_CASE(ENDPGM_TRAP)
5422 NODE_NAME_CASE(SIMULATED_TRAP)
5423 NODE_NAME_CASE(DWORDADDR)
5424 NODE_NAME_CASE(FRACT)
5425 NODE_NAME_CASE(SETCC)
5426 NODE_NAME_CASE(SETREG)
5427 NODE_NAME_CASE(DENORM_MODE)
5428 NODE_NAME_CASE(FMA_W_CHAIN)
5429 NODE_NAME_CASE(FMUL_W_CHAIN)
5430 NODE_NAME_CASE(CLAMP)
5431 NODE_NAME_CASE(COS_HW)
5432 NODE_NAME_CASE(SIN_HW)
5433 NODE_NAME_CASE(FMAX_LEGACY)
5434 NODE_NAME_CASE(FMIN_LEGACY)
5435 NODE_NAME_CASE(FMAX3)
5436 NODE_NAME_CASE(SMAX3)
5437 NODE_NAME_CASE(UMAX3)
5438 NODE_NAME_CASE(FMIN3)
5439 NODE_NAME_CASE(SMIN3)
5440 NODE_NAME_CASE(UMIN3)
5441 NODE_NAME_CASE(FMED3)
5442 NODE_NAME_CASE(SMED3)
5443 NODE_NAME_CASE(UMED3)
5444 NODE_NAME_CASE(FMAXIMUM3)
5445 NODE_NAME_CASE(FMINIMUM3)
5446 NODE_NAME_CASE(FDOT2)
5447 NODE_NAME_CASE(URECIP)
5448 NODE_NAME_CASE(DIV_SCALE)
5449 NODE_NAME_CASE(DIV_FMAS)
5450 NODE_NAME_CASE(DIV_FIXUP)
5451 NODE_NAME_CASE(FMAD_FTZ)
5452 NODE_NAME_CASE(RCP)
5453 NODE_NAME_CASE(RSQ)
5454 NODE_NAME_CASE(RCP_LEGACY)
5455 NODE_NAME_CASE(RCP_IFLAG)
5456 NODE_NAME_CASE(LOG)
5457 NODE_NAME_CASE(EXP)
5458 NODE_NAME_CASE(FMUL_LEGACY)
5459 NODE_NAME_CASE(RSQ_CLAMP)
5460 NODE_NAME_CASE(FP_CLASS)
5461 NODE_NAME_CASE(DOT4)
5462 NODE_NAME_CASE(CARRY)
5463 NODE_NAME_CASE(BORROW)
5464 NODE_NAME_CASE(BFE_U32)
5465 NODE_NAME_CASE(BFE_I32)
5466 NODE_NAME_CASE(BFI)
5467 NODE_NAME_CASE(BFM)
5468 NODE_NAME_CASE(FFBH_U32)
5469 NODE_NAME_CASE(FFBH_I32)
5470 NODE_NAME_CASE(FFBL_B32)
5471 NODE_NAME_CASE(MUL_U24)
5472 NODE_NAME_CASE(MUL_I24)
5473 NODE_NAME_CASE(MULHI_U24)
5474 NODE_NAME_CASE(MULHI_I24)
5475 NODE_NAME_CASE(MAD_U24)
5476 NODE_NAME_CASE(MAD_I24)
5477 NODE_NAME_CASE(MAD_I64_I32)
5478 NODE_NAME_CASE(MAD_U64_U32)
5479 NODE_NAME_CASE(PERM)
5480 NODE_NAME_CASE(TEXTURE_FETCH)
5481 NODE_NAME_CASE(R600_EXPORT)
5482 NODE_NAME_CASE(CONST_ADDRESS)
5483 NODE_NAME_CASE(REGISTER_LOAD)
5484 NODE_NAME_CASE(REGISTER_STORE)
5485 NODE_NAME_CASE(SAMPLE)
5486 NODE_NAME_CASE(SAMPLEB)
5487 NODE_NAME_CASE(SAMPLED)
5488 NODE_NAME_CASE(SAMPLEL)
5489 NODE_NAME_CASE(CVT_F32_UBYTE0)
5490 NODE_NAME_CASE(CVT_F32_UBYTE1)
5491 NODE_NAME_CASE(CVT_F32_UBYTE2)
5492 NODE_NAME_CASE(CVT_F32_UBYTE3)
5493 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5494 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5495 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5496 NODE_NAME_CASE(CVT_PK_I16_I32)
5497 NODE_NAME_CASE(CVT_PK_U16_U32)
5498 NODE_NAME_CASE(FP_TO_FP16)
5499 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5500 NODE_NAME_CASE(CONST_DATA_PTR)
5501 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5502   NODE_NAME_CASE(LDS)
5503   NODE_NAME_CASE(FPTRUNC_ROUND)
5504 NODE_NAME_CASE(DUMMY_CHAIN)
5505   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5506   NODE_NAME_CASE(LOAD_D16_HI)
5507 NODE_NAME_CASE(LOAD_D16_LO)
5508 NODE_NAME_CASE(LOAD_D16_HI_I8)
5509 NODE_NAME_CASE(LOAD_D16_HI_U8)
5510 NODE_NAME_CASE(LOAD_D16_LO_I8)
5511 NODE_NAME_CASE(LOAD_D16_LO_U8)
5512 NODE_NAME_CASE(STORE_MSKOR)
5513 NODE_NAME_CASE(LOAD_CONSTANT)
5514 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5515 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5516 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5517 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5518 NODE_NAME_CASE(DS_ORDERED_COUNT)
5519 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5520 NODE_NAME_CASE(BUFFER_LOAD)
5521 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5522 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5523 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5524 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5525 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5526 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5527 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5528 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5529 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5530 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5531 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5532 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5533 NODE_NAME_CASE(SBUFFER_LOAD)
5534 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5535 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5536 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5537 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5538 NODE_NAME_CASE(BUFFER_STORE)
5539 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5540 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5541 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5542 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5543 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5544 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5545 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5546 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5547 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5548 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5549 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5550 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5551 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5552 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5553 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5554 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5555 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5556 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5557 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5558 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5559 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5560 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5561
5562   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5563   }
5564 return nullptr;
5565}
5566
5567 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5568                                               SelectionDAG &DAG, int Enabled,
5569 int &RefinementSteps,
5570 bool &UseOneConstNR,
5571 bool Reciprocal) const {
5572 EVT VT = Operand.getValueType();
5573
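  // The hardware rsq is used directly, with no Newton-Raphson refinement
  // steps requested (RefinementSteps = 0).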
5574 if (VT == MVT::f32) {
5575 RefinementSteps = 0;
5576 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5577 }
5578
5579   // TODO: There is also an f64 rsq instruction, but the documentation is less
5580 // clear on its precision.
5581
5582 return SDValue();
5583}
5584
5585 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5586                                                SelectionDAG &DAG, int Enabled,
5587 int &RefinementSteps) const {
5588 EVT VT = Operand.getValueType();
5589
5590 if (VT == MVT::f32) {
5591 // Reciprocal, < 1 ulp error.
5592 //
5593 // This reciprocal approximation converges to < 0.5 ulp error with one
5594     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
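    // One such step computes x1 = x0 * (2 - v * x0), i.e.
    //   e = fma(-v, x0, 1.0); x1 = fma(x0, e, x0).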
5595
5596 RefinementSteps = 0;
5597 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5598 }
5599
5600   // TODO: There is also an f64 rcp instruction, but the documentation is less
5601 // clear on its precision.
5602
5603 return SDValue();
5604}
5605
5606static unsigned workitemIntrinsicDim(unsigned ID) {
5607 switch (ID) {
5608 case Intrinsic::amdgcn_workitem_id_x:
5609 return 0;
5610 case Intrinsic::amdgcn_workitem_id_y:
5611 return 1;
5612 case Intrinsic::amdgcn_workitem_id_z:
5613 return 2;
5614 default:
5615 llvm_unreachable("not a workitem intrinsic");
5616 }
5617}
5618
5619 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5620     const SDValue Op, KnownBits &Known,
5621 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5622
5623 Known.resetAll(); // Don't know anything.
5624
5625 unsigned Opc = Op.getOpcode();
5626
5627 switch (Opc) {
5628 default:
5629 break;
5630 case AMDGPUISD::CARRY:
5631 case AMDGPUISD::BORROW: {
5632 Known.Zero = APInt::getHighBitsSet(32, 31);
5633 break;
5634 }
5635
5636 case AMDGPUISD::BFE_I32:
5637 case AMDGPUISD::BFE_U32: {
5638 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5639 if (!CWidth)
5640 return;
5641
5642 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5643
5644 if (Opc == AMDGPUISD::BFE_U32)
5645 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5646
5647 break;
5648 }
5649 case AMDGPUISD::FP_TO_FP16: {
5650 unsigned BitWidth = Known.getBitWidth();
5651
5652 // High bits are zero.
5653     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5654     break;
5655 }
5656 case AMDGPUISD::MUL_U24:
5657 case AMDGPUISD::MUL_I24: {
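    // These multiply only the low 24 bits of their operands, so known bits
    // of the inputs carry over to the product.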
5658 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5659 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5660 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5661 RHSKnown.countMinTrailingZeros();
5662 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5663 // Skip extra check if all bits are known zeros.
5664 if (TrailZ >= 32)
5665 break;
5666
5667 // Truncate to 24 bits.
5668 LHSKnown = LHSKnown.trunc(24);
5669 RHSKnown = RHSKnown.trunc(24);
5670
5671 if (Opc == AMDGPUISD::MUL_I24) {
5672 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5673 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5674 unsigned MaxValBits = LHSValBits + RHSValBits;
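      // A product of values with a and b significant bits needs at most
      // a + b bits; everything above that is a copy of the sign bit.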
5675 if (MaxValBits > 32)
5676 break;
5677 unsigned SignBits = 32 - MaxValBits + 1;
5678 bool LHSNegative = LHSKnown.isNegative();
5679 bool LHSNonNegative = LHSKnown.isNonNegative();
5680 bool LHSPositive = LHSKnown.isStrictlyPositive();
5681 bool RHSNegative = RHSKnown.isNegative();
5682 bool RHSNonNegative = RHSKnown.isNonNegative();
5683 bool RHSPositive = RHSKnown.isStrictlyPositive();
5684
5685 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5686 Known.Zero.setHighBits(SignBits);
5687 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5688 Known.One.setHighBits(SignBits);
5689 } else {
5690 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5691 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5692 unsigned MaxValBits = LHSValBits + RHSValBits;
5693 if (MaxValBits >= 32)
5694 break;
5695 Known.Zero.setBitsFrom(MaxValBits);
5696 }
5697 break;
5698 }
5699 case AMDGPUISD::PERM: {
5700 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5701 if (!CMask)
5702 return;
5703
5704 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5705 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5706 unsigned Sel = CMask->getZExtValue();
5707
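    // Each selector byte picks one byte of the result: values 0-3 take a
    // byte from the second source, 4-6 from the first, 0x0c produces 0x00,
    // and values above 0x0c produce 0xff; other selectors are left unknown
    // here.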
5708 for (unsigned I = 0; I < 32; I += 8) {
5709 unsigned SelBits = Sel & 0xff;
5710 if (SelBits < 4) {
5711 SelBits *= 8;
5712 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5713 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5714 } else if (SelBits < 7) {
5715 SelBits = (SelBits & 3) * 8;
5716 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5717 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5718 } else if (SelBits == 0x0c) {
5719 Known.Zero |= 0xFFull << I;
5720 } else if (SelBits > 0x0c) {
5721 Known.One |= 0xFFull << I;
5722 }
5723 Sel >>= 8;
5724 }
5725 break;
5726 }
5727   case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5728     Known.Zero.setHighBits(24);
5729 break;
5730 }
5731   case AMDGPUISD::BUFFER_LOAD_USHORT: {
5732     Known.Zero.setHighBits(16);
5733 break;
5734 }
5735 case AMDGPUISD::LDS: {
5736 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5737 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5738
5739 Known.Zero.setHighBits(16);
5740 Known.Zero.setLowBits(Log2(Alignment));
5741 break;
5742 }
5743 case AMDGPUISD::SMIN3:
5744 case AMDGPUISD::SMAX3:
5745 case AMDGPUISD::SMED3:
5746 case AMDGPUISD::UMIN3:
5747 case AMDGPUISD::UMAX3:
5748 case AMDGPUISD::UMED3: {
5749 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5750 if (Known2.isUnknown())
5751 break;
5752
5753 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5754 if (Known1.isUnknown())
5755 break;
5756
5757 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5758 if (Known0.isUnknown())
5759 break;
5760
5761 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5762 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5763 Known.One = Known0.One & Known1.One & Known2.One;
5764 break;
5765 }
5766   case ISD::INTRINSIC_WO_CHAIN: {
5767     unsigned IID = Op.getConstantOperandVal(0);
5768 switch (IID) {
5769 case Intrinsic::amdgcn_workitem_id_x:
5770 case Intrinsic::amdgcn_workitem_id_y:
5771 case Intrinsic::amdgcn_workitem_id_z: {
5772 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5773           DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5774       Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5775 break;
5776 }
5777 default:
5778 break;
5779 }
5780 }
5781 }
5782}
5783
5784 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5785     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5786 unsigned Depth) const {
5787 switch (Op.getOpcode()) {
5788 case AMDGPUISD::BFE_I32: {
5789 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5790 if (!Width)
5791 return 1;
5792
5793 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5794 if (!isNullConstant(Op.getOperand(1)))
5795 return SignBits;
5796
5797 // TODO: Could probably figure something out with non-0 offsets.
5798 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5799 return std::max(SignBits, Op0SignBits);
5800 }
5801
5802 case AMDGPUISD::BFE_U32: {
5803 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5804 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5805 }
5806
5807 case AMDGPUISD::CARRY:
5808 case AMDGPUISD::BORROW:
5809 return 31;
5810   case AMDGPUISD::BUFFER_LOAD_BYTE:
5811     return 25;
5812   case AMDGPUISD::BUFFER_LOAD_SHORT:
5813     return 17;
5814   case AMDGPUISD::BUFFER_LOAD_UBYTE:
5815     return 24;
5816   case AMDGPUISD::BUFFER_LOAD_USHORT:
5817     return 16;
5818   case AMDGPUISD::FP_TO_FP16:
5819     return 16;
5820 case AMDGPUISD::SMIN3:
5821 case AMDGPUISD::SMAX3:
5822 case AMDGPUISD::SMED3:
5823 case AMDGPUISD::UMIN3:
5824 case AMDGPUISD::UMAX3:
5825 case AMDGPUISD::UMED3: {
5826 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5827 if (Tmp2 == 1)
5828 return 1; // Early out.
5829
5830 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5831 if (Tmp1 == 1)
5832 return 1; // Early out.
5833
5834 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5835 if (Tmp0 == 1)
5836 return 1; // Early out.
5837
5838 return std::min({Tmp0, Tmp1, Tmp2});
5839 }
5840 default:
5841 return 1;
5842 }
5843}
5844
5845 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5846     GISelKnownBits &Analysis, Register R,
5847     const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5848 unsigned Depth) const {
5849 const MachineInstr *MI = MRI.getVRegDef(R);
5850 if (!MI)
5851 return 1;
5852
5853 // TODO: Check range metadata on MMO.
5854 switch (MI->getOpcode()) {
5855 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5856 return 25;
5857 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5858 return 17;
5859 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5860 return 24;
5861 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5862 return 16;
5863 case AMDGPU::G_AMDGPU_SMED3:
5864 case AMDGPU::G_AMDGPU_UMED3: {
5865 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5866 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5867 if (Tmp2 == 1)
5868 return 1;
5869 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5870 if (Tmp1 == 1)
5871 return 1;
5872 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5873 if (Tmp0 == 1)
5874 return 1;
5875 return std::min({Tmp0, Tmp1, Tmp2});
5876 }
5877 default:
5878 return 1;
5879 }
5880}
5881
5882 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5883                                                         const SelectionDAG &DAG,
5884 bool SNaN,
5885 unsigned Depth) const {
5886 unsigned Opcode = Op.getOpcode();
5887 switch (Opcode) {
5888   case AMDGPUISD::FMIN_LEGACY:
5889   case AMDGPUISD::FMAX_LEGACY: {
5890     if (SNaN)
5891 return true;
5892
5893 // TODO: Can check no nans on one of the operands for each one, but which
5894 // one?
5895 return false;
5896 }
5897   case AMDGPUISD::FMUL_LEGACY:
5898   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5899     if (SNaN)
5900 return true;
5901 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5902 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5903 }
5904 case AMDGPUISD::FMED3:
5905 case AMDGPUISD::FMIN3:
5906 case AMDGPUISD::FMAX3:
5907   case AMDGPUISD::FMINIMUM3:
5908   case AMDGPUISD::FMAXIMUM3:
5909   case AMDGPUISD::FMAD_FTZ: {
5910 if (SNaN)
5911 return true;
5912 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5913 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5914 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5915 }
5916   case AMDGPUISD::CVT_F32_UBYTE0:
5917   case AMDGPUISD::CVT_F32_UBYTE1:
5918   case AMDGPUISD::CVT_F32_UBYTE2:
5919   case AMDGPUISD::CVT_F32_UBYTE3:
5920     return true;
5921
5922 case AMDGPUISD::RCP:
5923 case AMDGPUISD::RSQ:
5924   case AMDGPUISD::RCP_LEGACY:
5925   case AMDGPUISD::RSQ_CLAMP: {
5926 if (SNaN)
5927 return true;
5928
5929 // TODO: Need is known positive check.
5930 return false;
5931 }
5932 case ISD::FLDEXP:
5933 case AMDGPUISD::FRACT: {
5934 if (SNaN)
5935 return true;
5936 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5937 }
5938   case AMDGPUISD::DIV_SCALE:
5939   case AMDGPUISD::DIV_FMAS:
5940   case AMDGPUISD::DIV_FIXUP:
5941     // TODO: Refine on operands.
5942 return SNaN;
5943 case AMDGPUISD::SIN_HW:
5944 case AMDGPUISD::COS_HW: {
5945 // TODO: Need check for infinity
5946 return SNaN;
5947 }
5948   case ISD::INTRINSIC_WO_CHAIN: {
5949     unsigned IntrinsicID = Op.getConstantOperandVal(0);
5950 // TODO: Handle more intrinsics
5951 switch (IntrinsicID) {
5952 case Intrinsic::amdgcn_cubeid:
5953 return true;
5954
5955 case Intrinsic::amdgcn_frexp_mant: {
5956 if (SNaN)
5957 return true;
5958 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5959 }
5960 case Intrinsic::amdgcn_cvt_pkrtz: {
5961 if (SNaN)
5962 return true;
5963 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5964 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5965 }
5966 case Intrinsic::amdgcn_rcp:
5967 case Intrinsic::amdgcn_rsq:
5968 case Intrinsic::amdgcn_rcp_legacy:
5969 case Intrinsic::amdgcn_rsq_legacy:
5970 case Intrinsic::amdgcn_rsq_clamp: {
5971 if (SNaN)
5972 return true;
5973
5974 // TODO: Need is known positive check.
5975 return false;
5976 }
5977 case Intrinsic::amdgcn_trig_preop:
5978 case Intrinsic::amdgcn_fdot2:
5979 // TODO: Refine on operand
5980 return SNaN;
5981 case Intrinsic::amdgcn_fma_legacy:
5982 if (SNaN)
5983 return true;
5984 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5985 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5986 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5987 default:
5988 return false;
5989 }
5990 }
5991 default:
5992 return false;
5993 }
5994}
5995
5996 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5997                                                Register N0, Register N1) const {
5998 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5999}
6000
6001/// Whether it is profitable to sink the operands of an
6002/// Instruction I to the basic block of I.
6003/// This helps use source modifiers (like abs and neg) more often.
6004 bool AMDGPUTargetLowering::shouldSinkOperands(
6005     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6006 using namespace PatternMatch;
6007
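  // Sinking free fabs/fneg next to their user lets selection fold them into
  // source modifiers (e.g. a neg modifier on a VOP3 source operand) instead
  // of emitting separate instructions.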
6008 for (auto &Op : I->operands()) {
6009 // Ensure we are not already sinking this operand.
6010 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6011 continue;
6012
6013 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6014 Ops.push_back(&Op);
6015 }
6016
6017 return !Ops.empty();
6018}
unsigned const MachineRegisterInfo * MRI
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:216
#define LLVM_READONLY
Definition: Compiler.h:223
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1325
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1098
const fltSemantics & getSemantics() const
Definition: APFloat.h:1368
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1116
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1070
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1010
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1498
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:384
iterator_range< arg_iterator > args()
Definition: Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:299
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:226
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:567
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:493
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:487
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:488
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:482
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:500
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:576
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:570
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
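A common use is splitting an i64 value into its i32 halves (N and DL assumed in scope):

  // Lo receives bits [31:0], Hi receives bits [63:32].
  auto [Lo, Hi] = DAG.SplitScalar(N, DL, MVT::i32, MVT::i32);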
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
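Hypothetical constructor-time configuration, with made-up type choices purely for illustration:

  // Expand 64-bit population count through generic legalization, and route
  // f64 selects through this target's custom LowerOperation hook.
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);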
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predicted right.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
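For example, to emit a runtime check that replaces a 64-bit divide with a 32-bit one whenever both operands dynamically fit in 32 bits:

  // 64-bit div/rem is slow; bypass to 32-bit when possible at runtime.
  addBypassSlowDiv(64, 32);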
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
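A sketch of how setTruncStoreAction and setLoadExtAction above are typically driven from a target constructor (the type choices are illustrative only):

  // Reject i64 -> i16 truncating stores and i16 -> i64 extending loads,
  // forcing legalization to expand them into legal operations.
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i16, Expand);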
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more preferable chain.
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be used to store the results of comparisons for use by selects and conditional branches.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequences that increase the amount of flow control.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts of Op.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
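A sketch of calling it from a custom load lowering (Op is assumed to wrap a LoadSDNode):

  // Expand the unaligned load, then merge the loaded value and new chain.
  auto [Value, NewChain] = expandUnalignedLoad(cast<LoadSDNode>(Op), DAG);
  return DAG.getMergeValues({Value, NewChain}, SDLoc(Op));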
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tessellation and geometry are not in use, or otherwise copy shader if one is needed).
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:752
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to the "store atomic" instruction.
Definition: ISDOpcodes.h:1309
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction.
Definition: ISDOpcodes.h:1099
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:840
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value.
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision (f16) floating point values.
Definition: ISDOpcodes.h:963
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:953
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:996
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:751
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:980
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1120
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1124
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:521
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
Definition: ISDOpcodes.h:1305
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
Definition: ISDOpcodes.h:229
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high half of the full 2N-bit product.
Definition: ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1041
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on the boolean result of comparing the lhs and rhs (ops #0 and #1) with the condition code in op #4.
Definition: ISDOpcodes.h:771
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1109
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
Definition: ISDOpcodes.h:848
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:938
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result.
Definition: ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1165
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1047
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:886
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1276
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1001
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:919
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero- or sign-extended from a narrower type.
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
Definition: ISDOpcodes.h:529
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1603
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1583
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
MaybeAlign getAlign(const Function &F, unsigned Index)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition: bit.h:281
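Two quick numeric examples of these helpers:

  uint64_t Rounded = PowerOf2Ceil(24);   // == 32
  int LZ = countl_zero<uint32_t>(32u);   // == 26 (bit 5 is the first 1)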
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
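For example:

  uint64_t V = 0x1122334455667788ULL;
  uint32_t Hi = Hi_32(V); // 0x11223344
  uint32_t Lo = Lo_32(V); // 0x55667788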
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1452
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:281
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:254
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:282
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:279
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:381
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:463
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:359
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:234
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:371
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:416
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:456
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:398
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
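A sketch of building extended and vector value types (Ctx is an LLVMContext assumed to be in scope):

  EVT I96   = EVT::getIntegerVT(Ctx, 96);          // 96-bit integer
  EVT V3I32 = EVT::getVectorVT(Ctx, MVT::i32, 3);  // <3 x i32>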
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:367
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight), and returns the integer EVT with that number of bits.
Definition: ValueTypes.h:405
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:314
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:283
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:319
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:142
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:306
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:327
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:299
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:231
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:103
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:94
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bits.
Definition: KnownBits.h:258
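These predicates are typically combined after a computeKnownBits query; a sketch:

  KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.isNonNegative() && Known.countMaxActiveBits() <= 24) {
    // Every possible value of Op fits in an unsigned 24-bit quantity.
  }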
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.