1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
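 // Editor's note: an illustrative sketch, not part of the original source.
 // With the promotions above, a v2i64 BUILD_VECTOR is legalized roughly as:
 //   v4i32 = BUILD_VECTOR lo(a), hi(a), lo(b), hi(b)
 //   v2i64 = BITCAST v4i32
 // so no 64-bit vector ALU operations are needed; only loads and stores see
 // the 64-bit element type directly.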
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
423 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
424 Custom);
425
426 if (Subtarget->hasPkMovB32()) {
427 // TODO: 16-bit element vectors should be legal with even aligned elements.
428 // TODO: Can be legal with wider source types than the result with
429 // subregister extracts.
430 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
431 }
432
433 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
434 Custom);
435
436 // Avoid stack access for these.
437 // TODO: Generalize to more vector types.
439 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
440 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
441 Custom);
442
443 // Deal with vec3 vector operations when widened to vec4.
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
446
447 // Deal with vec5/6/7 vector operations when widened to vec8.
449 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
450 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
451 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
452 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
453 Custom);
454
455 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
456 // and output demarshalling
457 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
458
459 // We can't return success/failure, only the old value,
460 // let LLVM add the comparison
462 Expand);
463
464 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
465
466 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
467
468 // FIXME: This should be narrowed to i32, but that only happens if i64 is
469 // illegal.
470 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
471 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
472
473 // This is s_memtime on SI and s_memrealtime on VI.
475
476 if (Subtarget->hasSMemRealTime() ||
480
481 if (Subtarget->has16BitInsts()) {
484 } else {
486 }
487
488 if (Subtarget->hasMadMacF32Insts())
490
491 if (!Subtarget->hasBFI())
492 // fcopysign can be done in a single instruction with BFI.
493 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
494
495 if (!Subtarget->hasBCNT(32))
497
498 if (!Subtarget->hasBCNT(64))
500
501 if (Subtarget->hasFFBH())
503
504 if (Subtarget->hasFFBL())
506
507 // We only really have 32-bit BFE instructions (and 16-bit on VI).
508 //
509 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
510 // effort to match them now. We want this to be false for i64 cases when the
511 // extraction isn't restricted to the upper or lower half. Ideally we would
512 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
513 // span the midpoint are probably relatively rare, so don't worry about them
514 // for now.
515 if (Subtarget->hasBFE())
517
518 // Clamp modifier on add/sub
519 if (Subtarget->hasIntClamp())
521
522 if (Subtarget->hasAddNoCarry())
523 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
524 Legal);
525
526 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
527 Custom);
528
529 // These are really only legal for ieee_mode functions. We should be avoiding
530 // them for functions that don't have ieee_mode enabled, so just say they are
531 // legal.
533 {MVT::f32, MVT::f64}, Legal);
534
535 if (Subtarget->haveRoundOpsF64())
537 Legal);
538 else
540 MVT::f64, Custom);
541
543 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
544 Legal);
545 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
546
549
550 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
551 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
552
553 // Custom lower these because we can't specify a rule based on an illegal
554 // source bf16.
557
558 if (Subtarget->has16BitInsts()) {
561 MVT::i16, Legal);
562
563 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
564
566 MVT::i16, Expand);
567
571 ISD::CTPOP},
572 MVT::i16, Promote);
573
575
576 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
577
579 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
581 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
582
586
588
589 // F16 - Constant Actions.
592
593 // F16 - Load/Store Actions.
595 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
597 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
598
599 // BF16 - Load/Store Actions.
601 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
603 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
604
605 // F16 - VOP1 Actions.
608 MVT::f16, Custom);
609
612
613 // F16 - VOP2 Actions.
614 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
615 Expand);
619
620 // F16 - VOP3 Actions.
622 if (STI.hasMadF16())
624
625 for (MVT VT :
626 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
627 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
628 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
629 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
630 switch (Op) {
631 case ISD::LOAD:
632 case ISD::STORE:
634 case ISD::BITCAST:
635 case ISD::UNDEF:
640 case ISD::IS_FPCLASS:
641 break;
645 break;
646 default:
648 break;
649 }
650 }
651 }
652
653 // v_perm_b32 can handle either of these.
654 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
656
657 // XXX - Do these do anything? Vector constants turn into build_vector.
658 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
659
660 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
661 Legal);
662
664 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
666 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
667
669 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
671 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
672
673 setOperationAction(ISD::AND, MVT::v2i16, Promote);
674 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
675 setOperationAction(ISD::OR, MVT::v2i16, Promote);
676 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
677 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
678 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
679
681 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
684 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
685 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
690 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
692 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
693
695 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
697 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
698 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
699 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
700
702 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
705
707 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
709 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
711 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
712
713 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
715 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
716 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
719
721 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
723 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
724 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
726
727 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
729 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
733
735 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
737 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
738 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
739 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
740
742 MVT::v2i32, Expand);
744
746 MVT::v4i32, Expand);
747
749 MVT::v8i32, Expand);
750
751 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
752 Subtarget->hasVOP3PInsts() ? Legal : Custom);
753
754 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
755 // This isn't really legal, but this avoids the legalizer unrolling it (and
756 // allows matching fneg (fabs x) patterns)
757 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
758
761
764 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
765 Custom);
766
768 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
769 Expand);
770
771 for (MVT Vec16 :
772 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
773 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
776 Vec16, Custom);
778 }
779 }
780
781 if (Subtarget->hasVOP3PInsts()) {
785 MVT::v2i16, Legal);
786
789 MVT::v2f16, Legal);
790
792 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
793
795 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
796 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
797 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
798 Custom);
799
800 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
801 // Split vector operations.
806 VT, Custom);
807
808 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
809 // Split vector operations.
811 VT, Custom);
812
813 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
814 Custom);
815
816 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
817 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
818 Custom);
819
820 if (Subtarget->hasPackedFP32Ops()) {
822 MVT::v2f32, Legal);
824 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
825 Custom);
826 }
827 }
828
830
831 if (Subtarget->has16BitInsts()) {
833 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
835 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
836 } else {
837 // Legalization hack.
838 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
839
841 }
842
844 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
845 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
846 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
847 MVT::v32f16, MVT::v32bf16},
848 Custom);
849
851
852 if (Subtarget->hasScalarSMulU64())
854
855 if (Subtarget->hasMad64_32())
857
858 if (Subtarget->hasPrefetch())
860
861 if (Subtarget->hasIEEEMinMax()) {
863 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
865 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
866 Custom);
867 } else {
868 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
869 if (Subtarget->hasMinimum3Maximum3F32())
871
872 if (Subtarget->hasMinimum3Maximum3PKF16())
874 }
875
877 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
878 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
879 MVT::i8},
880 Custom);
881
883 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
884 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
885 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
886 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
887 Custom);
888
890 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
891 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
892 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
893 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
894 Custom);
895
901
902 // TODO: Could move this to custom lowering, could benefit from combines on
903 // extract of relevant bits.
905
907
908 if (Subtarget->hasBF16ConversionInsts()) {
912 }
913
914 if (Subtarget->hasCvtPkF16F32Inst()) {
916 }
917
920 ISD::SUB,
922 ISD::MUL,
923 ISD::FADD,
924 ISD::FSUB,
925 ISD::FDIV,
926 ISD::FMUL,
933 ISD::FMA,
934 ISD::SMIN,
935 ISD::SMAX,
936 ISD::UMIN,
937 ISD::UMAX,
940 ISD::SMIN,
941 ISD::SMAX,
942 ISD::UMIN,
943 ISD::UMAX,
944 ISD::AND,
945 ISD::OR,
946 ISD::XOR,
947 ISD::SHL,
948 ISD::SRL,
949 ISD::SRA,
950 ISD::FSHR,
960
961 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
963
964 // All memory operations. Some folding on the pointer operand is done to help
965 // matching the constant offsets in the addressing modes.
990
991 // FIXME: In other contexts we pretend this is a per-function property.
993
995}
996
997const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
998
1000 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1001 return RCRegs;
1002}
1003
1004//===----------------------------------------------------------------------===//
1005// TargetLowering queries
1006//===----------------------------------------------------------------------===//
1007
1008// v_mad_mix* support a conversion from f16 to f32.
1009//
1010// There is only one special case, when denormals are enabled, that we don't
1011// currently handle where this would still be OK to use.
1012bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1013 EVT DestVT, EVT SrcVT) const {
1014 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1015 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1016 DestVT.getScalarType() == MVT::f32 &&
1017 SrcVT.getScalarType() == MVT::f16 &&
1018 // TODO: This probably only requires no input flushing?
1019 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1020}
1021
1022bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1023 LLT DestTy, LLT SrcTy) const {
1024 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1025 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1026 DestTy.getScalarSizeInBits() == 32 &&
1027 SrcTy.getScalarSizeInBits() == 16 &&
1028 // TODO: This probably only requires no input flushing?
1029 denormalModeIsFlushAllF32(*MI.getMF());
1030}
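// Editor's note: an illustrative example, not part of the original source.
// With f32 denormals flushed, IR such as
//   %x.ext = fpext half %x to float
//   %r = call float @llvm.fma.f32(float %x.ext, float %y, float %z)
// can fold the extension into a mixed-precision FMA (e.g. v_fma_mix_f32),
// because the hooks above report the f16 -> f32 extension as foldable.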
1031
1033 // SI has some legal vector types, but no legal vector operations. Say no
1034 // shuffles are legal in order to prefer scalarizing some vector operations.
1035 return false;
1036}
1037
1040 EVT VT) const {
1043
1044 if (VT.isVector()) {
1045 EVT ScalarVT = VT.getScalarType();
1046 unsigned Size = ScalarVT.getSizeInBits();
1047 if (Size == 16) {
1048 if (Subtarget->has16BitInsts()) {
1049 if (VT.isInteger())
1050 return MVT::v2i16;
1051 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1052 }
1053 return VT.isInteger() ? MVT::i32 : MVT::f32;
1054 }
1055
1056 if (Size < 16)
1057 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1058 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1059 }
1060
1061 if (VT.getSizeInBits() > 32)
1062 return MVT::i32;
1063
1065}
1066
1069 EVT VT) const {
1072
1073 if (VT.isVector()) {
1074 unsigned NumElts = VT.getVectorNumElements();
1075 EVT ScalarVT = VT.getScalarType();
1076 unsigned Size = ScalarVT.getSizeInBits();
1077
1078 // FIXME: Should probably promote 8-bit vectors to i16.
1079 if (Size == 16 && Subtarget->has16BitInsts())
1080 return (NumElts + 1) / 2;
1081
1082 if (Size <= 32)
1083 return NumElts;
1084
1085 if (Size > 32)
1086 return NumElts * ((Size + 31) / 32);
1087 } else if (VT.getSizeInBits() > 32)
1088 return (VT.getSizeInBits() + 31) / 32;
1089
1091}
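// Editor's note: a worked example, not part of the original source.
// For a non-kernel calling convention on a subtarget with 16-bit instructions,
// a v5f16 argument has five 16-bit elements, so (NumElts + 1) / 2 == 3
// registers are used; a v3i64 argument needs 3 * ((64 + 31) / 32) == 6
// 32-bit registers.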
1092
1094 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1095 unsigned &NumIntermediates, MVT &RegisterVT) const {
1096 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1097 unsigned NumElts = VT.getVectorNumElements();
1098 EVT ScalarVT = VT.getScalarType();
1099 unsigned Size = ScalarVT.getSizeInBits();
1100 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1101 // support, but unless we can properly handle 3-vectors, it will still be
1102 // inconsistent.
1103 if (Size == 16 && Subtarget->has16BitInsts()) {
1104 if (ScalarVT == MVT::bf16) {
1105 RegisterVT = MVT::i32;
1106 IntermediateVT = MVT::v2bf16;
1107 } else {
1108 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1109 IntermediateVT = RegisterVT;
1110 }
1111 NumIntermediates = (NumElts + 1) / 2;
1112 return NumIntermediates;
1113 }
1114
1115 if (Size == 32) {
1116 RegisterVT = ScalarVT.getSimpleVT();
1117 IntermediateVT = RegisterVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size < 16 && Subtarget->has16BitInsts()) {
1123 // FIXME: Should probably form v2i16 pieces
1124 RegisterVT = MVT::i16;
1125 IntermediateVT = ScalarVT;
1126 NumIntermediates = NumElts;
1127 return NumIntermediates;
1128 }
1129
1130 if (Size != 16 && Size <= 32) {
1131 RegisterVT = MVT::i32;
1132 IntermediateVT = ScalarVT;
1133 NumIntermediates = NumElts;
1134 return NumIntermediates;
1135 }
1136
1137 if (Size > 32) {
1138 RegisterVT = MVT::i32;
1139 IntermediateVT = RegisterVT;
1140 NumIntermediates = NumElts * ((Size + 31) / 32);
1141 return NumIntermediates;
1142 }
1143 }
1144
1146 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1147}
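// Editor's note: a worked example, not part of the original source.
// Continuing the v5f16 case above, this returns RegisterVT = IntermediateVT =
// v2f16 with NumIntermediates = 3, i.e. the value is passed as three v2f16
// pieces, the last of which has an unused high lane.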
1148
1150 const DataLayout &DL, Type *Ty,
1151 unsigned MaxNumLanes) {
1152 assert(MaxNumLanes != 0);
1153
1154 LLVMContext &Ctx = Ty->getContext();
1155 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1156 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1157 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1158 NumElts);
1159 }
1160
1161 return TLI.getValueType(DL, Ty);
1162}
1163
1164// Peek through TFE struct returns to only use the data size.
1166 const DataLayout &DL, Type *Ty,
1167 unsigned MaxNumLanes) {
1168 auto *ST = dyn_cast<StructType>(Ty);
1169 if (!ST)
1170 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1171
1172 // TFE intrinsics return an aggregate type.
1173 assert(ST->getNumContainedTypes() == 2 &&
1174 ST->getContainedType(1)->isIntegerTy(32));
1175 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1176}
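// Editor's note: an illustrative example, not part of the original source.
// An image load with TFE enabled returns an aggregate such as
//   { <4 x float>, i32 }
// where the trailing i32 is the error/status word; only the data member
// determines the memory VT, so the helper above reports v4f32 (possibly
// narrowed further by the dmask via MaxNumLanes).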
1177
1178/// Map address space 7 to MVT::v5i32 because that's its in-memory
1179/// representation. This return value is vector-typed because there is no
1180/// MVT::i160 and it is not clear if one can be added. While this could
1181/// cause issues during codegen, these address space 7 pointers will be
1182/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1183/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1184/// modeling, to work.
1186 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1187 return MVT::v5i32;
1189 DL.getPointerSizeInBits(AS) == 192)
1190 return MVT::v6i32;
1192}
1193/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1194/// v8i32 when padding is added.
1195/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1196/// also v8i32 with padding.
1198 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1199 DL.getPointerSizeInBits(AS) == 160) ||
1201 DL.getPointerSizeInBits(AS) == 192))
1202 return MVT::v8i32;
1204}
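// Editor's note: an illustrative example, not part of the original source.
// A 160-bit buffer fat pointer (address space 7) is reported as v5i32 for
// value purposes and as v8i32 for in-memory purposes, matching its padded
// {p8, i32} layout; such pointers are expected to be rewritten away before
// instruction selection.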
1205
1207 const CallInst &CI,
1208 MachineFunction &MF,
1209 unsigned IntrID) const {
1211 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1213 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1215 Info.flags |= getTargetMMOFlags(CI);
1216
1217 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1219 AttributeList Attr =
1221 MemoryEffects ME = Attr.getMemoryEffects();
1222 if (ME.doesNotAccessMemory())
1223 return false;
1224
1225 // TODO: Should images get their own address space?
1226 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1227
1228 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1229 if (RsrcIntr->IsImage) {
1232 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1233 Info.align.reset();
1234 }
1235
1236 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1237 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1238 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1239 // We conservatively set the memory operand of a buffer intrinsic to the
1240 // base resource pointer, so that we can access alias information about
1241 // those pointers. Cases like "this points at the same value
1242 // but with a different offset" are handled in
1243 // areMemAccessesTriviallyDisjoint.
1244 Info.ptrVal = RsrcArg;
1245 }
1246
1247 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1248 if (!IsSPrefetch) {
1249 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1250 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1252 }
1253
1255 if (ME.onlyReadsMemory()) {
1256 if (RsrcIntr->IsImage) {
1257 unsigned MaxNumLanes = 4;
1258
1259 if (!BaseOpcode->Gather4) {
1260 // If this isn't a gather, we may have excess loaded elements in the
1261 // IR type. Check the dmask for the real number of elements loaded.
1262 unsigned DMask =
1263 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1264 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1265 }
1266
1267 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1268 CI.getType(), MaxNumLanes);
1269 } else {
1270 Info.memVT =
1272 std::numeric_limits<unsigned>::max());
1273 }
1274
1275 // FIXME: What does alignment mean for an image?
1278 } else if (ME.onlyWritesMemory()) {
1280
1281 Type *DataTy = CI.getArgOperand(0)->getType();
1282 if (RsrcIntr->IsImage) {
1283 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1284 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1285 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1286 DMaskLanes);
1287 } else
1288 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1289
1291 } else {
1292 // Atomic, NoReturn Sampler or prefetch
1295 Info.flags |=
1297
1298 if (!IsSPrefetch)
1300
1301 switch (IntrID) {
1302 default:
1303 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1304 // Fake memory access type for no return sampler intrinsics
1305 Info.memVT = MVT::i32;
1306 } else {
1307 // XXX - Should this be volatile without known ordering?
1309 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1310 }
1311 break;
1312 case Intrinsic::amdgcn_raw_buffer_load_lds:
1313 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1314 case Intrinsic::amdgcn_struct_buffer_load_lds:
1315 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1316 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1317 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1318 Info.ptrVal = CI.getArgOperand(1);
1319 return true;
1320 }
1321 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1322 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1323 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1324 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1325 Info.memVT =
1327 std::numeric_limits<unsigned>::max());
1328 Info.flags &= ~MachineMemOperand::MOStore;
1329 return true;
1330 }
1331 }
1332 }
1333 return true;
1334 }
1335
1336 switch (IntrID) {
1337 case Intrinsic::amdgcn_ds_ordered_add:
1338 case Intrinsic::amdgcn_ds_ordered_swap: {
1340 Info.memVT = MVT::getVT(CI.getType());
1341 Info.ptrVal = CI.getOperand(0);
1342 Info.align.reset();
1344
1345 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1346 if (!Vol->isZero())
1348
1349 return true;
1350 }
1351 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1352 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1354 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1355 Info.ptrVal = nullptr;
1356 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1358 return true;
1359 }
1360 case Intrinsic::amdgcn_ds_append:
1361 case Intrinsic::amdgcn_ds_consume: {
1363 Info.memVT = MVT::getVT(CI.getType());
1364 Info.ptrVal = CI.getOperand(0);
1365 Info.align.reset();
1367
1368 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1369 if (!Vol->isZero())
1371
1372 return true;
1373 }
1374 case Intrinsic::amdgcn_global_atomic_csub: {
1376 Info.memVT = MVT::getVT(CI.getType());
1377 Info.ptrVal = CI.getOperand(0);
1378 Info.align.reset();
1381 return true;
1382 }
1383 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1385 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1386
1387 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1388 Info.align.reset();
1389 Info.flags |=
1391 return true;
1392 }
1393 case Intrinsic::amdgcn_global_atomic_fmin_num:
1394 case Intrinsic::amdgcn_global_atomic_fmax_num:
1395 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1396 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1397 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1398 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1400 Info.memVT = MVT::getVT(CI.getType());
1401 Info.ptrVal = CI.getOperand(0);
1402 Info.align.reset();
1406 return true;
1407 }
1408 case Intrinsic::amdgcn_global_load_tr_b64:
1409 case Intrinsic::amdgcn_global_load_tr_b128:
1410 case Intrinsic::amdgcn_ds_read_tr4_b64:
1411 case Intrinsic::amdgcn_ds_read_tr6_b96:
1412 case Intrinsic::amdgcn_ds_read_tr8_b64:
1413 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1415 Info.memVT = MVT::getVT(CI.getType());
1416 Info.ptrVal = CI.getOperand(0);
1417 Info.align.reset();
1419 return true;
1420 }
1421 case Intrinsic::amdgcn_ds_gws_init:
1422 case Intrinsic::amdgcn_ds_gws_barrier:
1423 case Intrinsic::amdgcn_ds_gws_sema_v:
1424 case Intrinsic::amdgcn_ds_gws_sema_br:
1425 case Intrinsic::amdgcn_ds_gws_sema_p:
1426 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1428
1429 const GCNTargetMachine &TM =
1430 static_cast<const GCNTargetMachine &>(getTargetMachine());
1431
1433 Info.ptrVal = MFI->getGWSPSV(TM);
1434
1435 // This is an abstract access, but we need to specify a type and size.
1436 Info.memVT = MVT::i32;
1437 Info.size = 4;
1438 Info.align = Align(4);
1439
1440 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1442 else
1444 return true;
1445 }
1446 case Intrinsic::amdgcn_global_load_lds: {
1448 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1449 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1450 Info.ptrVal = CI.getArgOperand(1);
1452 return true;
1453 }
1454 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1456
1457 const GCNTargetMachine &TM =
1458 static_cast<const GCNTargetMachine &>(getTargetMachine());
1459
1461 Info.ptrVal = MFI->getGWSPSV(TM);
1462
1463 // This is an abstract access, but we need to specify a type and size.
1464 Info.memVT = MVT::i32;
1465 Info.size = 4;
1466 Info.align = Align(4);
1467
1469 return true;
1470 }
1471 case Intrinsic::amdgcn_s_prefetch_data: {
1473 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1474 Info.ptrVal = CI.getArgOperand(0);
1476 return true;
1477 }
1478 default:
1479 return false;
1480 }
1481}
1482
1484 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1485 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1486 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1487 // The DAG's ValueType loses the addrspaces.
1488 // Add them as 2 extra Constant operands "from" and "to".
1489 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1490 unsigned DstAS = I.getType()->getPointerAddressSpace();
1491 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1492 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1493 break;
1494 }
1495 default:
1496 break;
1497 }
1498}
1499
1502 Type *&AccessTy) const {
1503 Value *Ptr = nullptr;
1504 switch (II->getIntrinsicID()) {
1505 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1506 case Intrinsic::amdgcn_ds_append:
1507 case Intrinsic::amdgcn_ds_consume:
1508 case Intrinsic::amdgcn_ds_read_tr4_b64:
1509 case Intrinsic::amdgcn_ds_read_tr6_b96:
1510 case Intrinsic::amdgcn_ds_read_tr8_b64:
1511 case Intrinsic::amdgcn_ds_read_tr16_b64:
1512 case Intrinsic::amdgcn_ds_ordered_add:
1513 case Intrinsic::amdgcn_ds_ordered_swap:
1514 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1515 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1516 case Intrinsic::amdgcn_global_atomic_csub:
1517 case Intrinsic::amdgcn_global_atomic_fmax_num:
1518 case Intrinsic::amdgcn_global_atomic_fmin_num:
1519 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1520 case Intrinsic::amdgcn_global_load_tr_b64:
1521 case Intrinsic::amdgcn_global_load_tr_b128:
1522 Ptr = II->getArgOperand(0);
1523 break;
1524 case Intrinsic::amdgcn_global_load_lds:
1525 Ptr = II->getArgOperand(1);
1526 break;
1527 default:
1528 return false;
1529 }
1530 AccessTy = II->getType();
1531 Ops.push_back(Ptr);
1532 return true;
1533}
1534
1536 unsigned AddrSpace) const {
1537 if (!Subtarget->hasFlatInstOffsets()) {
1538 // Flat instructions do not have offsets, and only have the register
1539 // address.
1540 return AM.BaseOffs == 0 && AM.Scale == 0;
1541 }
1542
1543 decltype(SIInstrFlags::FLAT) FlatVariant =
1547
1548 return AM.Scale == 0 &&
1549 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1550 AM.BaseOffs, AddrSpace, FlatVariant));
1551}
1552
1554 if (Subtarget->hasFlatGlobalInsts())
1556
1557 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1558 // Assume we will use FLAT for all global memory accesses
1559 // on VI.
1560 // FIXME: This assumption is currently wrong. On VI we still use
1561 // MUBUF instructions for the r + i addressing mode. As currently
1562 // implemented, the MUBUF instructions only work on buffer < 4GB.
1563 // It may be possible to support > 4GB buffers with MUBUF instructions,
1564 // by setting the stride value in the resource descriptor which would
1565 // increase the size limit to (stride * 4GB). However, this is risky,
1566 // because it has never been validated.
1568 }
1569
1570 return isLegalMUBUFAddressingMode(AM);
1571}
1572
1573bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1574 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1575 // additionally can do r + r + i with addr64. 32-bit has more addressing
1576 // mode options. Depending on the resource constant, it can also do
1577 // (i64 r0) + (i32 r1) * (i14 i).
1578 //
1579 // Private arrays end up using a scratch buffer most of the time, so also
1580 // assume those use MUBUF instructions. Scratch loads / stores are currently
1581 // implemented as mubuf instructions with offen bit set, so slightly
1582 // different than the normal addr64.
1583 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1584 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1585 return false;
1586
1587 // FIXME: Since we can split immediate into soffset and immediate offset,
1588 // would it make sense to allow any immediate?
1589
1590 switch (AM.Scale) {
1591 case 0: // r + i or just i, depending on HasBaseReg.
1592 return true;
1593 case 1:
1594 return true; // We have r + r or r + i.
1595 case 2:
1596 if (AM.HasBaseReg) {
1597 // Reject 2 * r + r.
1598 return false;
1599 }
1600
1601 // Allow 2 * r as r + r
1602 // Or 2 * r + i is allowed as r + r + i.
1603 return true;
1604 default: // Don't allow n * r
1605 return false;
1606 }
1607}
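// Editor's note: an illustrative example, not part of the original source.
// For MUBUF, an addressing mode of the form {BaseReg + 8} (Scale == 0) or
// {BaseReg + IndexReg} (Scale == 1) is accepted, while {BaseReg + 3 * IndexReg}
// is rejected because n * r cannot be encoded; the immediate must also pass
// the isLegalMUBUFImmOffset() check above.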
1608
1610 const AddrMode &AM, Type *Ty,
1611 unsigned AS,
1612 Instruction *I) const {
1613 // No global is ever allowed as a base.
1614 if (AM.BaseGV)
1615 return false;
1616
1617 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1618 return isLegalGlobalAddressingMode(AM);
1619
1620 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1624 // If the offset isn't a multiple of 4, it probably isn't going to be
1625 // correctly aligned.
1626 // FIXME: Can we get the real alignment here?
1627 if (AM.BaseOffs % 4 != 0)
1628 return isLegalMUBUFAddressingMode(AM);
1629
1630 if (!Subtarget->hasScalarSubwordLoads()) {
1631 // There are no SMRD extloads, so if we have to do a small type access we
1632 // will use a MUBUF load.
1633 // FIXME?: We also need to do this if unaligned, but we don't know the
1634 // alignment here.
1635 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1636 return isLegalGlobalAddressingMode(AM);
1637 }
1638
1640 // SMRD instructions have an 8-bit, dword offset on SI.
1641 if (!isUInt<8>(AM.BaseOffs / 4))
1642 return false;
1643 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1644 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1645 // in 8-bits, it can use a smaller encoding.
1646 if (!isUInt<32>(AM.BaseOffs / 4))
1647 return false;
1648 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1649 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1650 if (!isUInt<20>(AM.BaseOffs))
1651 return false;
1652 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1653 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1654 // for S_BUFFER_* instructions).
1655 if (!isInt<21>(AM.BaseOffs))
1656 return false;
1657 } else {
1658 // On GFX12, all offsets are signed 24-bit in bytes.
1659 if (!isInt<24>(AM.BaseOffs))
1660 return false;
1661 }
1662
1663 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1665 AM.BaseOffs < 0) {
1666 // Scalar (non-buffer) loads can only use a negative offset if
1667 // soffset+offset is non-negative. Since the compiler can only prove that
1668 // in a few special cases, it is safer to claim that negative offsets are
1669 // not supported.
1670 return false;
1671 }
1672
1673 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1674 return true;
1675
1676 if (AM.Scale == 1 && AM.HasBaseReg)
1677 return true;
1678
1679 return false;
1680 }
1681
1682 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1683 return Subtarget->enableFlatScratch()
1685 : isLegalMUBUFAddressingMode(AM);
1686
1687 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1688 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1689 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1690 // field.
1691 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1692 // an 8-bit dword offset but we don't know the alignment here.
1693 if (!isUInt<16>(AM.BaseOffs))
1694 return false;
1695
1696 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1697 return true;
1698
1699 if (AM.Scale == 1 && AM.HasBaseReg)
1700 return true;
1701
1702 return false;
1703 }
1704
1706 // For an unknown address space, this usually means that this is for some
1707 // reason being used for pure arithmetic, and not based on some addressing
1708 // computation. We don't have instructions that compute pointers with any
1709 // addressing modes, so treat them as having no offset like flat
1710 // instructions.
1712 }
1713
1714 // Assume a user alias of global for unknown address spaces.
1715 return isLegalGlobalAddressingMode(AM);
1716}
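// Editor's note: an illustrative example, not part of the original source.
// For a scalar (SMRD) access to the constant address space, BaseOffs == 1020
// is legal on SI since 1020 / 4 == 255 fits the 8-bit dword offset, and it
// also fits the wider fields of later generations; a negative offset such as
// -16 is rejected for non-buffer constant loads even where the encoding is
// signed, per the soffset caveat above.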
1717
1719 const MachineFunction &MF) const {
1721 return (MemVT.getSizeInBits() <= 4 * 32);
1722 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1723 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1724 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1725 }
1727 return (MemVT.getSizeInBits() <= 2 * 32);
1728 return true;
1729}
1730
1732 unsigned Size, unsigned AddrSpace, Align Alignment,
1733 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1734 if (IsFast)
1735 *IsFast = 0;
1736
1737 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1738 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1739 // Check if alignment requirements for ds_read/write instructions are
1740 // disabled.
1741 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1742 return false;
1743
1744 Align RequiredAlignment(
1745 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1746 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1747 Alignment < RequiredAlignment)
1748 return false;
1749
1750 // Either the alignment requirements are "enabled", or there is an
1751 // unaligned LDS access related hardware bug even though alignment
1752 // requirements are "disabled". In either case, we need to check for proper
1753 // alignment requirements.
1754 //
1755 switch (Size) {
1756 case 64:
1757 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1758 // address is negative, then the instruction is incorrectly treated as
1759 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1760 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1761 // load later in the SILoadStoreOptimizer.
1762 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1763 return false;
1764
1765 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1766 // can do a 4 byte aligned, 8 byte access in a single operation using
1767 // ds_read2/write2_b32 with adjacent offsets.
1768 RequiredAlignment = Align(4);
1769
1770 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1771 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1772 // ds_write2_b32 depending on the alignment. In either case with either
1773 // alignment there is no faster way of doing this.
1774
1775 // The numbers returned here and below are not additive; they form a 'speed
1776 // rank'. They are just meant to be compared to decide if a certain way
1777 // of lowering an operation is faster than another. For that purpose a
1778 // naturally aligned operation gets its bitsize to indicate that "it
1779 // operates with a speed comparable to an N-bit wide load". With full
1780 // alignment ds128 is slower than ds96, for example. If underaligned it
1781 // is comparable to the speed of a single dword access, which would then
1782 // mean 32 < 128 and it is faster to issue a wide load regardless.
1783 // 1 simply means "slow, don't do it", i.e. comparing an aligned load to a
1784 // wider load that will no longer be aligned, the latter is slower.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? 64
1787 : (Alignment < Align(4)) ? 32
1788 : 1;
1789 return true;
1790 }
1791
1792 break;
1793 case 96:
1794 if (!Subtarget->hasDS96AndDS128())
1795 return false;
1796
1797 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1798 // gfx8 and older.
1799
1800 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1801 // Naturally aligned access is fastest. However, also report it is Fast
1802 // if memory is aligned less than DWORD. A narrow load or store will be
1803 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1804 // be more of them, so overall we will pay less penalty issuing a single
1805 // instruction.
1806
1807 // See comment on the values above.
1808 if (IsFast)
1809 *IsFast = (Alignment >= RequiredAlignment) ? 96
1810 : (Alignment < Align(4)) ? 32
1811 : 1;
1812 return true;
1813 }
1814
1815 break;
1816 case 128:
1817 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1818 return false;
1819
1820 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1821 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1822 // single operation using ds_read2/write2_b64.
1823 RequiredAlignment = Align(8);
1824
1825 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1826 // Naturally aligned access is fastest. However, also report it is Fast
1827 // if memory is aligned less than DWORD. A narrow load or store will be
1828 // equally slow as a single ds_read_b128/ds_write_b128, but there
1829 // will be more of them, so overall we will pay less penalty issuing a
1830 // single instruction.
1831
1832 // See comment on the values above.
1833 if (IsFast)
1834 *IsFast = (Alignment >= RequiredAlignment) ? 128
1835 : (Alignment < Align(4)) ? 32
1836 : 1;
1837 return true;
1838 }
1839
1840 break;
1841 default:
1842 if (Size > 32)
1843 return false;
1844
1845 break;
1846 }
1847
1848 // See comment on the values above.
1849 // Note that we have a single-dword or sub-dword access here, so if
1850 // underaligned it is the slowest possible access, hence the returned value is 0.
1851 if (IsFast)
1852 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1853
1854 return Alignment >= RequiredAlignment ||
1855 Subtarget->hasUnalignedDSAccessEnabled();
1856 }
1857
1858 // FIXME: We have to be conservative here and assume that flat operations
1859 // will access scratch. If we had access to the IR function, then we
1860 // could determine if any private memory was used in the function.
1861 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1862 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1863 bool AlignedBy4 = Alignment >= Align(4);
1864 if (IsFast)
1865 *IsFast = AlignedBy4;
1866
1867 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1868 }
1869
1870 // So long as they are correct, wide global memory operations perform better
1871 // than multiple smaller memory ops -- even when misaligned
1872 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1873 if (IsFast)
1874 *IsFast = Size;
1875
1876 return Alignment >= Align(4) ||
1878 }
1879
1880 // Smaller than dword value must be aligned.
1881 if (Size < 32)
1882 return false;
1883
1884 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1885 // byte-address are ignored, thus forcing Dword alignment.
1886 // This applies to private, global, and constant memory.
1887 if (IsFast)
1888 *IsFast = 1;
1889
1890 return Size >= 32 && Alignment >= Align(4);
1891}
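// Editor's note: an illustrative example, not part of the original source.
// With unaligned LDS access enabled, a 128-bit LDS access gets a "speed rank"
// of 128 when at least 8-byte aligned (ds_read_b128 or ds_read2_b64), 32 when
// aligned below a dword (comparable to one dword access, so a single wide
// load still wins over splitting), and 1 otherwise; callers only compare
// these ranks, they are not additive.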
1892
1894 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1895 unsigned *IsFast) const {
1897 Alignment, Flags, IsFast);
1898}
1899
1901 const MemOp &Op, const AttributeList &FuncAttributes) const {
1902 // FIXME: Should account for address space here.
1903
1904 // The default fallback uses the private pointer size as a guess for a type to
1905 // use. Make sure we switch these to 64-bit accesses.
1906
1907 if (Op.size() >= 16 &&
1908 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1909 return MVT::v4i32;
1910
1911 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1912 return MVT::v2i32;
1913
1914 // Use the default.
1915 return MVT::Other;
1916}
1917
1919 const MemSDNode *MemNode = cast<MemSDNode>(N);
1920 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1921}
1922
1924 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1926}
1927
1929 unsigned DestAS) const {
1930 // Flat -> private/local is a simple truncate.
1931 // Flat -> global is no-op
1932 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1933 return true;
1934
1935 const GCNTargetMachine &TM =
1936 static_cast<const GCNTargetMachine &>(getTargetMachine());
1937 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1938}
1939
1942 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1943 VT.getScalarType().bitsLE(MVT::i16))
1946}
1947
1949 Type *Ty) const {
1950 // FIXME: Could be smarter if called for vector constants.
1951 return true;
1952}
1953
1955 unsigned Index) const {
1957 return false;
1958
1959 // TODO: Add more cases that are cheap.
1960 return Index == 0;
1961}
1962
1963bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1964 // TODO: This should be more aggressive, particular for 16-bit element
1965 // vectors. However there are some mixed improvements and regressions.
1966 EVT EltTy = VT.getVectorElementType();
1967 return EltTy.getSizeInBits() % 32 == 0;
1968}
1969
1971 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1972 switch (Op) {
1973 case ISD::LOAD:
1974 case ISD::STORE:
1975 return true;
1976 default:
1977 return false;
1978 }
1979 }
1980
1981 // SimplifySetCC uses this function to determine whether or not it should
1982 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1983 if (VT == MVT::i1 && Op == ISD::SETCC)
1984 return false;
1985
1987}
1988
1989SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1990 const SDLoc &SL,
1991 SDValue Chain,
1992 uint64_t Offset) const {
1993 const DataLayout &DL = DAG.getDataLayout();
1997
1998 auto [InputPtrReg, RC, ArgTy] =
2000
2001 // We may not have the kernarg segment argument if we have no kernel
2002 // arguments.
2003 if (!InputPtrReg)
2004 return DAG.getConstant(Offset, SL, PtrVT);
2005
2007 SDValue BasePtr = DAG.getCopyFromReg(
2008 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2009
2010 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2011}
2012
2013SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2014 const SDLoc &SL) const {
2017 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2018}
2019
2020SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2021 const SDLoc &SL) const {
2022
2024 std::optional<uint32_t> KnownSize =
2026 if (KnownSize.has_value())
2027 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2028 return SDValue();
2029}
2030
2031SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2032 const SDLoc &SL, SDValue Val,
2033 bool Signed,
2034 const ISD::InputArg *Arg) const {
2035 // First, if it is a widened vector, narrow it.
2036 if (VT.isVector() &&
2038 EVT NarrowedVT =
2041 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2042 DAG.getConstant(0, SL, MVT::i32));
2043 }
2044
2045 // Then convert the vector elements or scalar value.
2046 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2047 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2048 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2049 }
2050
2051 if (MemVT.isFloatingPoint())
2052 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2053 else if (Signed)
2054 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2055 else
2056 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2057
2058 return Val;
2059}
2060
2061SDValue SITargetLowering::lowerKernargMemParameter(
2062 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2063 uint64_t Offset, Align Alignment, bool Signed,
2064 const ISD::InputArg *Arg) const {
2066
2067 // Try to avoid using an extload by loading earlier than the argument address,
2068 // and extracting the relevant bits. The load should hopefully be merged with
2069 // the previous argument.
2070 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2071 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2072 int64_t AlignDownOffset = alignDown(Offset, 4);
2073 int64_t OffsetDiff = Offset - AlignDownOffset;
2074
2075 EVT IntVT = MemVT.changeTypeToInteger();
2076
2077 // TODO: If we passed in the base kernel offset we could have a better
2078 // alignment than 4, but we don't really need it.
2079 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2080 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2083
2084 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2085 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2086
2087 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2088 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2089 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2090
2091 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2092 }
2093
2094 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2095 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2098
2099 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2100 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2101}
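// [Editor's illustration; not part of SIISelLowering.cpp] The sub-dword path
// above loads the containing dword and shifts the wanted bytes down: an i16
// argument at kernarg offset 6, for example, loads the dword at offset 4 and
// shifts right by 16 bits before truncating. A scalar sketch of that
// arithmetic (the helper name is hypothetical):
static unsigned short extractSubDwordKernArgSketch(unsigned DwordAtAlignedOffset,
                                                   unsigned long long Offset) {
  unsigned long long AlignDownOffset = Offset & ~3ull;      // alignDown(Offset, 4)
  unsigned OffsetDiff = unsigned(Offset - AlignDownOffset); // bytes into the dword
  // Mirrors the ISD::SRL by OffsetDiff * 8 followed by ISD::TRUNCATE above.
  return (unsigned short)(DwordAtAlignedOffset >> (OffsetDiff * 8));
}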
2102
2103SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2104 CCValAssign &VA, const SDLoc &SL,
2105 SDValue Chain,
2106 const ISD::InputArg &Arg) const {
2108 MachineFrameInfo &MFI = MF.getFrameInfo();
2109
2110 if (Arg.Flags.isByVal()) {
2111 unsigned Size = Arg.Flags.getByValSize();
2112 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2113 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2114 }
2115
2116 unsigned ArgOffset = VA.getLocMemOffset();
2117 unsigned ArgSize = VA.getValVT().getStoreSize();
2118
2119 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2120
2121 // Create load nodes to retrieve arguments from the stack.
2122 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2123 SDValue ArgValue;
2124
2125 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2127 MVT MemVT = VA.getValVT();
2128
2129 switch (VA.getLocInfo()) {
2130 default:
2131 break;
2132 case CCValAssign::BCvt:
2133 MemVT = VA.getLocVT();
2134 break;
2135 case CCValAssign::SExt:
2136 ExtType = ISD::SEXTLOAD;
2137 break;
2138 case CCValAssign::ZExt:
2139 ExtType = ISD::ZEXTLOAD;
2140 break;
2141 case CCValAssign::AExt:
2142 ExtType = ISD::EXTLOAD;
2143 break;
2144 }
2145
2146 ArgValue = DAG.getExtLoad(
2147 ExtType, SL, VA.getLocVT(), Chain, FIN,
2149 return ArgValue;
2150}
2151
2152SDValue SITargetLowering::getPreloadedValue(
2153 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2155 const ArgDescriptor *Reg = nullptr;
2156 const TargetRegisterClass *RC;
2157 LLT Ty;
2158
2160 const ArgDescriptor WorkGroupIDX =
2161 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2162 // If GridZ is not programmed in an entry function then the hardware will set
2163 // it to all zeros, so there is no need to mask the GridY value in the low
2164 // order bits.
2165 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2166 AMDGPU::TTMP7,
2167 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2168 const ArgDescriptor WorkGroupIDZ =
2169 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2170 if (Subtarget->hasArchitectedSGPRs() &&
2172 switch (PVID) {
2174 Reg = &WorkGroupIDX;
2175 RC = &AMDGPU::SReg_32RegClass;
2176 Ty = LLT::scalar(32);
2177 break;
2179 Reg = &WorkGroupIDY;
2180 RC = &AMDGPU::SReg_32RegClass;
2181 Ty = LLT::scalar(32);
2182 break;
2184 Reg = &WorkGroupIDZ;
2185 RC = &AMDGPU::SReg_32RegClass;
2186 Ty = LLT::scalar(32);
2187 break;
2188 default:
2189 break;
2190 }
2191 }
2192
2193 if (!Reg)
2194 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2195 if (!Reg) {
2197 // It's possible for a kernarg intrinsic call to appear in a kernel with
2198 // no allocated segment, in which case we do not add the user sgpr
2199 // argument, so just return null.
2200 return DAG.getConstant(0, SDLoc(), VT);
2201 }
2202
2203 // It's undefined behavior if a function marked with the amdgpu-no-*
2204 // attributes uses the corresponding intrinsic.
2205 return DAG.getUNDEF(VT);
2206 }
2207
2208 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2209}
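// [Editor's illustration; not part of SIISelLowering.cpp] With architected
// SGPRs the workgroup IDs live in trap temporaries: X in TTMP9, Y in the low
// half of TTMP7 and Z in the high half, which is what the 0xFFFFu and
// 0xFFFF0000u masks above select. A sketch of the unpacking (the helper name
// is hypothetical):
static void unpackArchitectedWorkGroupIDsSketch(unsigned TTMP7Value,
                                                unsigned &WorkGroupIDY,
                                                unsigned &WorkGroupIDZ) {
  WorkGroupIDY = TTMP7Value & 0xFFFFu; // low 16 bits hold GridY
  WorkGroupIDZ = TTMP7Value >> 16;     // high 16 bits hold GridZ
}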
2210
2212 CallingConv::ID CallConv,
2213 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2214 FunctionType *FType,
2215 SIMachineFunctionInfo *Info) {
2216 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2217 const ISD::InputArg *Arg = &Ins[I];
2218
2219 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2220 "vector type argument should have been split");
2221
2222 // First check if it's a PS input addr.
2223 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2224 PSInputNum <= 15) {
2225 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2226
2227 // Inconveniently only the first part of the split is marked as isSplit,
2228 // so skip to the end. We only want to increment PSInputNum once for the
2229 // entire split argument.
2230 if (Arg->Flags.isSplit()) {
2231 while (!Arg->Flags.isSplitEnd()) {
2232 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2233 "unexpected vector split in ps argument type");
2234 if (!SkipArg)
2235 Splits.push_back(*Arg);
2236 Arg = &Ins[++I];
2237 }
2238 }
2239
2240 if (SkipArg) {
2241 // We can safely skip PS inputs.
2242 Skipped.set(Arg->getOrigArgIndex());
2243 ++PSInputNum;
2244 continue;
2245 }
2246
2247 Info->markPSInputAllocated(PSInputNum);
2248 if (Arg->Used)
2249 Info->markPSInputEnabled(PSInputNum);
2250
2251 ++PSInputNum;
2252 }
2253
2254 Splits.push_back(*Arg);
2255 }
2256}
2257
2258// Allocate special inputs passed in VGPRs.
2260 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2261 SIMachineFunctionInfo &Info) const {
2262 const LLT S32 = LLT::scalar(32);
2264
2265 if (Info.hasWorkItemIDX()) {
2266 Register Reg = AMDGPU::VGPR0;
2267 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268
2269 CCInfo.AllocateReg(Reg);
2270 unsigned Mask =
2271 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2272 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2273 }
2274
2275 if (Info.hasWorkItemIDY()) {
2276 assert(Info.hasWorkItemIDX());
2277 if (Subtarget->hasPackedTID()) {
2278 Info.setWorkItemIDY(
2279 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2280 } else {
2281 unsigned Reg = AMDGPU::VGPR1;
2282 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283
2284 CCInfo.AllocateReg(Reg);
2285 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2286 }
2287 }
2288
2289 if (Info.hasWorkItemIDZ()) {
2290 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2291 if (Subtarget->hasPackedTID()) {
2292 Info.setWorkItemIDZ(
2293 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2294 } else {
2295 unsigned Reg = AMDGPU::VGPR2;
2296 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2297
2298 CCInfo.AllocateReg(Reg);
2299 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2300 }
2301 }
2302}
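// [Editor's illustration; not part of SIISelLowering.cpp] With packed TIDs all
// three workitem IDs arrive in VGPR0, which is why the descriptors above use
// the masks 0x3ff, 0x3ff << 10 and 0x3ff << 20. A sketch of the unpacking (the
// helper name is hypothetical):
static void unpackPackedWorkItemIDsSketch(unsigned VGPR0Value, unsigned &X,
                                          unsigned &Y, unsigned &Z) {
  X = VGPR0Value & 0x3ffu;         // bits [9:0]
  Y = (VGPR0Value >> 10) & 0x3ffu; // bits [19:10]
  Z = (VGPR0Value >> 20) & 0x3ffu; // bits [29:20]
}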
2303
2304// Try to allocate a VGPR at the end of the argument list, or if no argument
2305// VGPRs are left, allocate a stack slot.
2306// If \p Mask is given, it indicates the bitfield position in the register.
2307// If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2308static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2309 ArgDescriptor Arg = ArgDescriptor()) {
2310 if (Arg.isSet())
2311 return ArgDescriptor::createArg(Arg, Mask);
2312
2313 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2314 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2315 if (RegIdx == ArgVGPRs.size()) {
2316 // Spill to stack required.
2317 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2318
2319 return ArgDescriptor::createStack(Offset, Mask);
2320 }
2321
2322 unsigned Reg = ArgVGPRs[RegIdx];
2323 Reg = CCInfo.AllocateReg(Reg);
2324 assert(Reg != AMDGPU::NoRegister);
2325
2326 MachineFunction &MF = CCInfo.getMachineFunction();
2327 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2328 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2329 return ArgDescriptor::createRegister(Reg, Mask);
2330}
2331
2333 const TargetRegisterClass *RC,
2334 unsigned NumArgRegs) {
2335 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2336 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2337 if (RegIdx == ArgSGPRs.size())
2338 report_fatal_error("ran out of SGPRs for arguments");
2339
2340 unsigned Reg = ArgSGPRs[RegIdx];
2341 Reg = CCInfo.AllocateReg(Reg);
2342 assert(Reg != AMDGPU::NoRegister);
2343
2344 MachineFunction &MF = CCInfo.getMachineFunction();
2345 MF.addLiveIn(Reg, RC);
2347}
2348
2349// If this has a fixed position, we still should allocate the register in the
2350// CCInfo state. Technically we could get away with this for values passed
2351// outside of the normal argument range.
2353 const TargetRegisterClass *RC,
2354 MCRegister Reg) {
2355 Reg = CCInfo.AllocateReg(Reg);
2356 assert(Reg != AMDGPU::NoRegister);
2357 MachineFunction &MF = CCInfo.getMachineFunction();
2358 MF.addLiveIn(Reg, RC);
2359}
2360
2361static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2362 if (Arg) {
2363 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2364 Arg.getRegister());
2365 } else
2366 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2367}
2368
2369static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2370 if (Arg) {
2371 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2372 Arg.getRegister());
2373 } else
2374 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2375}
2376
2377/// Allocate implicit function VGPR arguments at the end of allocated user
2378/// arguments.
2380 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2381 SIMachineFunctionInfo &Info) const {
2382 const unsigned Mask = 0x3ff;
2383 ArgDescriptor Arg;
2384
2385 if (Info.hasWorkItemIDX()) {
2386 Arg = allocateVGPR32Input(CCInfo, Mask);
2387 Info.setWorkItemIDX(Arg);
2388 }
2389
2390 if (Info.hasWorkItemIDY()) {
2391 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2392 Info.setWorkItemIDY(Arg);
2393 }
2394
2395 if (Info.hasWorkItemIDZ())
2396 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2397}
2398
2399/// Allocate implicit function VGPR arguments in fixed registers.
2401 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2402 SIMachineFunctionInfo &Info) const {
2403 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2404 if (!Reg)
2405 report_fatal_error("failed to allocated VGPR for implicit arguments");
2406
2407 const unsigned Mask = 0x3ff;
2408 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2409 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2410 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2411}
2412
2414 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2415 SIMachineFunctionInfo &Info) const {
2416 auto &ArgInfo = Info.getArgInfo();
2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418
2419 // TODO: Unify handling with private memory pointers.
2420 if (UserSGPRInfo.hasDispatchPtr())
2421 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2422
2423 if (UserSGPRInfo.hasQueuePtr())
2424 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2425
2426 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2427 // constant offset from the kernarg segment.
2428 if (Info.hasImplicitArgPtr())
2429 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2430
2431 if (UserSGPRInfo.hasDispatchID())
2432 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2433
2434 // flat_scratch_init is not applicable for non-kernel functions.
2435
2436 if (Info.hasWorkGroupIDX())
2437 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2438
2439 if (Info.hasWorkGroupIDY())
2440 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2441
2442 if (Info.hasWorkGroupIDZ())
2443 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2444
2445 if (Info.hasLDSKernelId())
2446 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2447}
2448
2449// Allocate special inputs passed in user SGPRs.
2451 MachineFunction &MF,
2452 const SIRegisterInfo &TRI,
2453 SIMachineFunctionInfo &Info) const {
2454 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2455 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2456 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2457 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2459 }
2460
2461 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2462 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2463 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2464 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2465 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2466 }
2467
2468 if (UserSGPRInfo.hasDispatchPtr()) {
2469 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2470 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2471 CCInfo.AllocateReg(DispatchPtrReg);
2472 }
2473
2474 if (UserSGPRInfo.hasQueuePtr()) {
2475 Register QueuePtrReg = Info.addQueuePtr(TRI);
2476 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2477 CCInfo.AllocateReg(QueuePtrReg);
2478 }
2479
2480 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2482 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2483 CCInfo.AllocateReg(InputPtrReg);
2484
2485 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2486 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2487 }
2488
2489 if (UserSGPRInfo.hasDispatchID()) {
2490 Register DispatchIDReg = Info.addDispatchID(TRI);
2491 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2492 CCInfo.AllocateReg(DispatchIDReg);
2493 }
2494
2495 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2496 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2497 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2498 CCInfo.AllocateReg(FlatScratchInitReg);
2499 }
2500
2501 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2502 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2503 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2504 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2505 }
2506
2507 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2508 // these from the dispatch pointer.
2509}
2510
2511// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2512// sequential, starting from the first argument.
2514 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2516 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2517 Function &F = MF.getFunction();
2518 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2519 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2520 bool InPreloadSequence = true;
2521 unsigned InIdx = 0;
2522 bool AlignedForImplictArgs = false;
2523 unsigned ImplicitArgOffset = 0;
2524 for (auto &Arg : F.args()) {
2525 if (!InPreloadSequence || !Arg.hasInRegAttr())
2526 break;
2527
2528 unsigned ArgIdx = Arg.getArgNo();
2529 // Don't preload non-original args or parts not in the current preload
2530 // sequence.
2531 if (InIdx < Ins.size() &&
2532 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2533 break;
2534
2535 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2536 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2537 InIdx++) {
2538 assert(ArgLocs[ArgIdx].isMemLoc());
2539 auto &ArgLoc = ArgLocs[InIdx];
2540 const Align KernelArgBaseAlign = Align(16);
2541 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2542 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2543 unsigned NumAllocSGPRs =
2544 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2545
2546 // Fix alignment for hidden arguments.
2547 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2548 if (!AlignedForImplictArgs) {
2549 ImplicitArgOffset =
2550 alignTo(LastExplicitArgOffset,
2551 Subtarget->getAlignmentForImplicitArgPtr()) -
2552 LastExplicitArgOffset;
2553 AlignedForImplictArgs = true;
2554 }
2555 ArgOffset += ImplicitArgOffset;
2556 }
2557
2558 // Arg is preloaded into the previous SGPR.
2559 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2560 assert(InIdx >= 1 && "No previous SGPR");
2561 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2562 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2563 continue;
2564 }
2565
2566 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2567 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2568 // Check for free user SGPRs for preloading.
2569 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2570 InPreloadSequence = false;
2571 break;
2572 }
2573
2574 // Preload this argument.
2575 const TargetRegisterClass *RC =
2576 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2577 SmallVectorImpl<MCRegister> *PreloadRegs =
2578 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2579
2580 if (PreloadRegs->size() > 1)
2581 RC = &AMDGPU::SGPR_32RegClass;
2582 for (auto &Reg : *PreloadRegs) {
2583 assert(Reg);
2584 MF.addLiveIn(Reg, RC);
2585 CCInfo.AllocateReg(Reg);
2586 }
2587
2588 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2589 }
2590 }
2591}
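// [Editor's illustration; not part of SIISelLowering.cpp] The preload loop
// above charges one user SGPR per dword of argument plus one per dword of
// padding between the previous argument and this one, and stops preloading
// once the free user SGPR budget would be exceeded. A scalar sketch of that
// bookkeeping (all names are hypothetical):
static bool fitsInFreeUserSGPRsSketch(unsigned ArgSizeInBits, unsigned ArgOffset,
                                      unsigned LastExplicitArgOffset,
                                      unsigned NumFreeUserSGPRs) {
  unsigned NumAllocSGPRs = (ArgSizeInBits + 31) / 32;   // alignTo(Bits, 32) / 32
  unsigned PaddingBytes = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = (PaddingBytes + 3) / 4;       // alignTo(Padding, 4) / 4
  return PaddingSGPRs + NumAllocSGPRs <= NumFreeUserSGPRs;
}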
2592
2594 const SIRegisterInfo &TRI,
2595 SIMachineFunctionInfo &Info) const {
2596 // Always allocate this last since it is a synthetic preload.
2597 if (Info.hasLDSKernelId()) {
2598 Register Reg = Info.addLDSKernelId();
2599 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600 CCInfo.AllocateReg(Reg);
2601 }
2602}
2603
2604// Allocate special input registers that are initialized per-wave.
2607 CallingConv::ID CallConv,
2608 bool IsShader) const {
2609 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2610 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2611 // Note: user SGPRs are handled by the front-end for graphics shaders
2612 // Pad up the used user SGPRs with dead inputs.
2613
2614 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2615 // before enabling architected SGPRs for workgroup IDs.
2616 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2617
2618 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2619 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2620 // rely on it to reach 16 since if we end up having no stack usage, it will
2621 // not really be added.
2622 unsigned NumRequiredSystemSGPRs =
2623 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2624 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2625 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2626 Register Reg = Info.addReservedUserSGPR();
2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(Reg);
2629 }
2630 }
2631
2632 if (!HasArchitectedSGPRs) {
2633 if (Info.hasWorkGroupIDX()) {
2634 Register Reg = Info.addWorkGroupIDX();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasWorkGroupIDY()) {
2640 Register Reg = Info.addWorkGroupIDY();
2641 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2642 CCInfo.AllocateReg(Reg);
2643 }
2644
2645 if (Info.hasWorkGroupIDZ()) {
2646 Register Reg = Info.addWorkGroupIDZ();
2647 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2648 CCInfo.AllocateReg(Reg);
2649 }
2650 }
2651
2652 if (Info.hasWorkGroupInfo()) {
2653 Register Reg = Info.addWorkGroupInfo();
2654 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2655 CCInfo.AllocateReg(Reg);
2656 }
2657
2658 if (Info.hasPrivateSegmentWaveByteOffset()) {
2659 // Scratch wave offset passed in system SGPR.
2660 unsigned PrivateSegmentWaveByteOffsetReg;
2661
2662 if (IsShader) {
2663 PrivateSegmentWaveByteOffsetReg =
2664 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2665
2666 // This is true if the scratch wave byte offset doesn't have a fixed
2667 // location.
2668 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2669 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2670 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2671 }
2672 } else
2673 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2674
2675 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2676 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2677 }
2678
2679 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2680 Info.getNumPreloadedSGPRs() >= 16);
2681}
2682
2684 MachineFunction &MF,
2685 const SIRegisterInfo &TRI,
2686 SIMachineFunctionInfo &Info) {
2687 // Now that we've figured out where the scratch register inputs are, see if
2688 // we should reserve the arguments and use them directly.
2689 MachineFrameInfo &MFI = MF.getFrameInfo();
2690 bool HasStackObjects = MFI.hasStackObjects();
2691 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2692
2693 // Record that we know we have non-spill stack objects so we don't need to
2694 // check all stack objects later.
2695 if (HasStackObjects)
2696 Info.setHasNonSpillStackObjects(true);
2697
2698 // Everything live out of a block is spilled with fast regalloc, so it's
2699 // almost certain that spilling will be required.
2700 if (TM.getOptLevel() == CodeGenOptLevel::None)
2701 HasStackObjects = true;
2702
2703 // For now assume stack access is needed in any callee functions, so we need
2704 // the scratch registers to pass in.
2705 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2706
2707 if (!ST.enableFlatScratch()) {
2708 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2709 // If we have stack objects, we unquestionably need the private buffer
2710 // resource. For the Code Object V2 ABI, this will be the first 4 user
2711 // SGPR inputs. We can reserve those and use them directly.
2712
2713 Register PrivateSegmentBufferReg =
2715 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2716 } else {
2717 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2718 // We tentatively reserve the last available registers (skipping those
2719 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2720 // we'll replace these with the ones immediately after those which were
2721 // really allocated. In the prologue copies will be inserted from the
2722 // argument to these reserved registers.
2723
2724 // Without HSA, relocations are used for the scratch pointer and the
2725 // buffer resource setup is always inserted in the prologue. Scratch wave
2726 // offset is still in an input SGPR.
2727 Info.setScratchRSrcReg(ReservedBufferReg);
2728 }
2729 }
2730
2732
2733 // For entry functions we have to set up the stack pointer if we use it,
2734 // whereas non-entry functions get this "for free". This means there is no
2735 // intrinsic advantage to using S32 over S34 in cases where we do not have
2736 // calls but do need a frame pointer (i.e. if we are requested to have one
2737 // because frame pointer elimination is disabled). To keep things simple we
2738 // only ever use S32 as the call ABI stack pointer, and so using it does not
2739 // imply we need a separate frame pointer.
2740 //
2741 // Try to use s32 as the SP, but move it if it would interfere with input
2742 // arguments. This won't work with calls though.
2743 //
2744 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2745 // registers.
2746 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2747 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2748 } else {
2750
2751 if (MFI.hasCalls())
2752 report_fatal_error("call in graphics shader with too many input SGPRs");
2753
2754 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2755 if (!MRI.isLiveIn(Reg)) {
2756 Info.setStackPtrOffsetReg(Reg);
2757 break;
2758 }
2759 }
2760
2761 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2762 report_fatal_error("failed to find register for SP");
2763 }
2764
2765 // hasFP should be accurate for entry functions even before the frame is
2766 // finalized, because it does not rely on the known stack size, only
2767 // properties like whether variable sized objects are present.
2768 if (ST.getFrameLowering()->hasFP(MF)) {
2769 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2770 }
2771}
2772
2775 return !Info->isEntryFunction();
2776}
2777
2779
2781 MachineBasicBlock *Entry,
2782 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2784
2785 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2786 if (!IStart)
2787 return;
2788
2789 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2790 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2791 MachineBasicBlock::iterator MBBI = Entry->begin();
2792 for (const MCPhysReg *I = IStart; *I; ++I) {
2793 const TargetRegisterClass *RC = nullptr;
2794 if (AMDGPU::SReg_64RegClass.contains(*I))
2795 RC = &AMDGPU::SGPR_64RegClass;
2796 else if (AMDGPU::SReg_32RegClass.contains(*I))
2797 RC = &AMDGPU::SGPR_32RegClass;
2798 else
2799 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2800
2801 Register NewVR = MRI->createVirtualRegister(RC);
2802 // Create copy from CSR to a virtual register.
2803 Entry->addLiveIn(*I);
2804 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2805 .addReg(*I);
2806
2807 // Insert the copy-back instructions right before the terminator.
2808 for (auto *Exit : Exits)
2809 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2810 TII->get(TargetOpcode::COPY), *I)
2811 .addReg(NewVR);
2812 }
2813}
2814
2816 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2817 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2818 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2820
2822 const Function &Fn = MF.getFunction();
2825
2826 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2827 DiagnosticInfoUnsupported NoGraphicsHSA(
2828 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2829 DAG.getContext()->diagnose(NoGraphicsHSA);
2830 return DAG.getEntryNode();
2831 }
2832
2835 BitVector Skipped(Ins.size());
2836 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2837 *DAG.getContext());
2838
2839 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2840 bool IsKernel = AMDGPU::isKernel(CallConv);
2841 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2842
2843 if (IsGraphics) {
2844 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2845 assert(!UserSGPRInfo.hasDispatchPtr() &&
2846 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2847 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2848 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2849 (void)UserSGPRInfo;
2850 if (!Subtarget->enableFlatScratch())
2851 assert(!UserSGPRInfo.hasFlatScratchInit());
2852 if ((CallConv != CallingConv::AMDGPU_CS &&
2853 CallConv != CallingConv::AMDGPU_Gfx) ||
2854 !Subtarget->hasArchitectedSGPRs())
2855 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2856 !Info->hasWorkGroupIDZ());
2857 }
2858
2859 if (CallConv == CallingConv::AMDGPU_PS) {
2860 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2861
2862 // At least one interpolation mode must be enabled or else the GPU will
2863 // hang.
2864 //
2865 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2866 // set PSInputAddr, the user wants to enable some bits after the compilation
2867 // based on run-time states. Since we can't know what the final PSInputEna
2868 // will look like, we shouldn't do anything here and the user should take
2869 // responsibility for the correct programming.
2870 //
2871 // Otherwise, the following restrictions apply:
2872 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2873 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2874 // enabled too.
2875 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2876 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2877 CCInfo.AllocateReg(AMDGPU::VGPR0);
2878 CCInfo.AllocateReg(AMDGPU::VGPR1);
2879 Info->markPSInputAllocated(0);
2880 Info->markPSInputEnabled(0);
2881 }
2882 if (Subtarget->isAmdPalOS()) {
2883 // For isAmdPalOS, the user does not enable some bits after compilation
2884 // based on run-time states; the register values being generated here are
2885 // the final ones set in hardware. Therefore we need to apply the
2886 // workaround to PSInputAddr and PSInputEnable together. (The case where
2887 // a bit is set in PSInputAddr but not PSInputEnable is where the
2888 // frontend set up an input arg for a particular interpolation mode, but
2889 // nothing uses that input arg. Really we should have an earlier pass
2890 // that removes such an arg.)
2891 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2892 if ((PsInputBits & 0x7F) == 0 ||
2893 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2894 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2895 }
2896 } else if (IsKernel) {
2897 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2898 } else {
2899 Splits.append(Ins.begin(), Ins.end());
2900 }
2901
2902 if (IsKernel)
2903 analyzeFormalArgumentsCompute(CCInfo, Ins);
2904
2905 if (IsEntryFunc) {
2906 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2907 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2908 if (IsKernel && Subtarget->hasKernargPreload())
2909 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2910
2911 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2912 } else if (!IsGraphics) {
2913 // For the fixed ABI, pass workitem IDs in the last argument register.
2914 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2915
2916 // FIXME: Sink this into allocateSpecialInputSGPRs
2917 if (!Subtarget->enableFlatScratch())
2918 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2919
2920 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2921 }
2922
2923 if (!IsKernel) {
2924 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2925 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2926 }
2927
2929
2930 // FIXME: This is the minimum kernel argument alignment. We should improve
2931 // this to the maximum alignment of the arguments.
2932 //
2933 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2934 // kern arg offset.
2935 const Align KernelArgBaseAlign = Align(16);
2936
2937 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2938 const ISD::InputArg &Arg = Ins[i];
2939 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2940 InVals.push_back(DAG.getUNDEF(Arg.VT));
2941 continue;
2942 }
2943
2944 CCValAssign &VA = ArgLocs[ArgIdx++];
2945 MVT VT = VA.getLocVT();
2946
2947 if (IsEntryFunc && VA.isMemLoc()) {
2948 VT = Ins[i].VT;
2949 EVT MemVT = VA.getLocVT();
2950
2951 const uint64_t Offset = VA.getLocMemOffset();
2952 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2953
2954 if (Arg.Flags.isByRef()) {
2955 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2956
2957 const GCNTargetMachine &TM =
2958 static_cast<const GCNTargetMachine &>(getTargetMachine());
2959 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2960 Arg.Flags.getPointerAddrSpace())) {
2963 }
2964
2965 InVals.push_back(Ptr);
2966 continue;
2967 }
2968
2969 SDValue NewArg;
2970 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2971 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2972 // In this case the argument is packed into the previous preload SGPR.
2973 int64_t AlignDownOffset = alignDown(Offset, 4);
2974 int64_t OffsetDiff = Offset - AlignDownOffset;
2975 EVT IntVT = MemVT.changeTypeToInteger();
2976
2980 Register Reg =
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2982
2983 assert(Reg);
2984 Register VReg = MRI.getLiveInVirtReg(Reg);
2985 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2986
2987 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2988 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2989
2990 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2991 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2992 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2993 Ins[i].Flags.isSExt(), &Ins[i]);
2994
2995 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2996 } else {
3000 const SmallVectorImpl<MCRegister> &PreloadRegs =
3001 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3002
3003 SDValue Copy;
3004 if (PreloadRegs.size() == 1) {
3005 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3006 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3007 NewArg = DAG.getCopyFromReg(
3008 Chain, DL, VReg,
3010 TRI->getRegSizeInBits(*RC)));
3011
3012 } else {
3013 // If the kernarg alignment does not match the alignment of the SGPR
3014 // tuple RC that can accommodate this argument, it will be built up
3015 // via copies from the individual SGPRs that the argument was
3016 // preloaded to.
3018 for (auto Reg : PreloadRegs) {
3019 Register VReg = MRI.getLiveInVirtReg(Reg);
3020 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3021 Elts.push_back(Copy);
3022 }
3023 NewArg =
3024 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3025 PreloadRegs.size()),
3026 DL, Elts);
3027 }
3028
3029 // If the argument was preloaded to multiple consecutive 32-bit
3030 // registers because of misalignment between addressable SGPR tuples
3031 // and the argument size, we can still assume, because of kernarg
3032 // segment alignment restrictions, that NewArg's size is the same as
3033 // MemVT and just do a bitcast. If MemVT is smaller than 32 bits we add a
3034 // truncate since we cannot preload to less than a single SGPR and the
3035 // MemVT may be smaller.
3036 EVT MemVTInt =
3038 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3039 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3040
3041 NewArg = DAG.getBitcast(MemVT, NewArg);
3042 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3043 Ins[i].Flags.isSExt(), &Ins[i]);
3044 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3045 }
3046 } else {
3047 // Hidden arguments that are in the kernel signature must be preloaded
3048 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3049 // the argument list and is not preloaded.
3050 if (Arg.isOrigArg()) {
3051 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3052 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3053 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3054 *OrigArg->getParent(),
3055 "hidden argument in kernel signature was not preloaded",
3056 DL.getDebugLoc());
3057 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3058 }
3059 }
3060
3061 NewArg =
3062 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3063 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3064 }
3065 Chains.push_back(NewArg.getValue(1));
3066
3067 auto *ParamTy =
3068 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3070 ParamTy &&
3071 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3072 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3073 // On SI, local pointers are just offsets into LDS, so they always
3074 // fit in 16 bits. On CI and newer they could potentially be
3075 // real pointers, so we can't guarantee their size.
3076 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3077 DAG.getValueType(MVT::i16));
3078 }
3079
3080 InVals.push_back(NewArg);
3081 continue;
3082 }
3083 if (!IsEntryFunc && VA.isMemLoc()) {
3084 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3085 InVals.push_back(Val);
3086 if (!Arg.Flags.isByVal())
3087 Chains.push_back(Val.getValue(1));
3088 continue;
3089 }
3090
3091 assert(VA.isRegLoc() && "Parameter must be in a register!");
3092
3093 Register Reg = VA.getLocReg();
3094 const TargetRegisterClass *RC = nullptr;
3095 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3096 RC = &AMDGPU::VGPR_32RegClass;
3097 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3098 RC = &AMDGPU::SGPR_32RegClass;
3099 else
3100 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3101 EVT ValVT = VA.getValVT();
3102
3103 Reg = MF.addLiveIn(Reg, RC);
3104 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3105
3106 if (Arg.Flags.isSRet()) {
3107 // The return object should be reasonably addressable.
3108
3109 // FIXME: This helps when the return is a real sret. If it is an
3110 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3111 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3112 unsigned NumBits =
3114 Val = DAG.getNode(
3115 ISD::AssertZext, DL, VT, Val,
3116 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3117 }
3118
3119 // If this is an 8 or 16-bit value, it is really passed promoted
3120 // to 32 bits. Insert an assert[sz]ext to capture this, then
3121 // truncate to the right size.
3122 switch (VA.getLocInfo()) {
3123 case CCValAssign::Full:
3124 break;
3125 case CCValAssign::BCvt:
3126 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3127 break;
3128 case CCValAssign::SExt:
3129 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3130 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3131 break;
3132 case CCValAssign::ZExt:
3133 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3134 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3135 break;
3136 case CCValAssign::AExt:
3137 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3138 break;
3139 default:
3140 llvm_unreachable("Unknown loc info!");
3141 }
3142
3143 InVals.push_back(Val);
3144 }
3145
3146 // Start adding system SGPRs.
3147 if (IsEntryFunc)
3148 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3149
3150 // DAG.getPass() returns nullptr when using new pass manager.
3151 // TODO: Use DAG.getMFAM() to access analysis result.
3152 if (DAG.getPass()) {
3153 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3154 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3155 }
3156
3157 unsigned StackArgSize = CCInfo.getStackSize();
3158 Info->setBytesInStackArgArea(StackArgSize);
3159
3160 return Chains.empty() ? Chain
3161 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3162}
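// [Editor's illustration; not part of SIISelLowering.cpp] The AMDGPU_PS
// handling above enables a dummy interpolation mode when none would otherwise
// be live: at least one PERSP_* (bits 0-3) or LINEAR_* (bits 4-6) input must
// be enabled, and POS_W_FLOAT (input 11) additionally requires a PERSP_*
// input. A sketch of that check (the helper name is hypothetical):
static bool needsDummyInterpModeSketch(unsigned PSInputAddr,
                                       bool PosWFloatAllocated) {
  bool NoPerspOrLinear = (PSInputAddr & 0x7F) == 0;
  bool PosWWithoutPersp = (PSInputAddr & 0xF) == 0 && PosWFloatAllocated;
  return NoPerspOrLinear || PosWWithoutPersp;
}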
3163
3164// TODO: If return values can't fit in registers, we should return as many as
3165// possible in registers before passing on stack.
3167 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3168 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3169 const Type *RetTy) const {
3170 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3171 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3172 // for shaders. Vector types should be explicitly handled by CC.
3173 if (AMDGPU::isEntryFunctionCC(CallConv))
3174 return true;
3175
3177 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3178 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3179 return false;
3180
3181 // We must use the stack if return would require unavailable registers.
3182 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3183 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3184 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3185 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3186 return false;
3187
3188 return true;
3189}
3190
3191SDValue
3193 bool isVarArg,
3195 const SmallVectorImpl<SDValue> &OutVals,
3196 const SDLoc &DL, SelectionDAG &DAG) const {
3199
3200 if (AMDGPU::isKernel(CallConv)) {
3201 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3202 OutVals, DL, DAG);
3203 }
3204
3205 bool IsShader = AMDGPU::isShader(CallConv);
3206
3207 Info->setIfReturnsVoid(Outs.empty());
3208 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3209
3210 // CCValAssign - represent the assignment of the return value to a location.
3213
3214 // CCState - Info about the registers and stack slots.
3215 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3216 *DAG.getContext());
3217
3218 // Analyze outgoing return values.
3219 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3220
3221 SDValue Glue;
3223 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3224
3225 // Copy the result values into the output registers.
3226 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3227 ++I, ++RealRVLocIdx) {
3228 CCValAssign &VA = RVLocs[I];
3229 assert(VA.isRegLoc() && "Can only return in registers!");
3230 // TODO: Partially return in registers if return values don't fit.
3231 SDValue Arg = OutVals[RealRVLocIdx];
3232
3233 // Copied from other backends.
3234 switch (VA.getLocInfo()) {
3235 case CCValAssign::Full:
3236 break;
3237 case CCValAssign::BCvt:
3238 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3239 break;
3240 case CCValAssign::SExt:
3241 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3242 break;
3243 case CCValAssign::ZExt:
3244 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3245 break;
3246 case CCValAssign::AExt:
3247 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3248 break;
3249 default:
3250 llvm_unreachable("Unknown loc info!");
3251 }
3252
3253 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3254 Glue = Chain.getValue(1);
3255 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3256 }
3257
3258 // FIXME: Does sret work properly?
3259 if (!Info->isEntryFunction()) {
3260 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3261 const MCPhysReg *I =
3262 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3263 if (I) {
3264 for (; *I; ++I) {
3265 if (AMDGPU::SReg_64RegClass.contains(*I))
3266 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3267 else if (AMDGPU::SReg_32RegClass.contains(*I))
3268 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3269 else
3270 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3271 }
3272 }
3273 }
3274
3275 // Update chain and glue.
3276 RetOps[0] = Chain;
3277 if (Glue.getNode())
3278 RetOps.push_back(Glue);
3279
3280 unsigned Opc = AMDGPUISD::ENDPGM;
3281 if (!IsWaveEnd)
3283 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3284}
3285
3287 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3288 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3289 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3290 SDValue ThisVal) const {
3291 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3292
3293 // Assign locations to each value returned by this call.
3295 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3296 *DAG.getContext());
3297 CCInfo.AnalyzeCallResult(Ins, RetCC);
3298
3299 // Copy all of the result registers out of their specified physreg.
3300 for (CCValAssign VA : RVLocs) {
3301 SDValue Val;
3302
3303 if (VA.isRegLoc()) {
3304 Val =
3305 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3306 Chain = Val.getValue(1);
3307 InGlue = Val.getValue(2);
3308 } else if (VA.isMemLoc()) {
3309 report_fatal_error("TODO: return values in memory");
3310 } else
3311 llvm_unreachable("unknown argument location type");
3312
3313 switch (VA.getLocInfo()) {
3314 case CCValAssign::Full:
3315 break;
3316 case CCValAssign::BCvt:
3317 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3318 break;
3319 case CCValAssign::ZExt:
3320 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3321 DAG.getValueType(VA.getValVT()));
3322 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3323 break;
3324 case CCValAssign::SExt:
3325 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3326 DAG.getValueType(VA.getValVT()));
3327 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3328 break;
3329 case CCValAssign::AExt:
3330 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3331 break;
3332 default:
3333 llvm_unreachable("Unknown loc info!");
3334 }
3335
3336 InVals.push_back(Val);
3337 }
3338
3339 return Chain;
3340}
3341
3342// Add code to pass the special inputs required by the used features, separate
3343// from the explicit user arguments present in the IR.
3345 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3346 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3347 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3348 // If we don't have a call site, this was a call inserted by
3349 // legalization. These can never use special inputs.
3350 if (!CLI.CB)
3351 return;
3352
3353 SelectionDAG &DAG = CLI.DAG;
3354 const SDLoc &DL = CLI.DL;
3355 const Function &F = DAG.getMachineFunction().getFunction();
3356
3357 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3358 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3359
3360 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3362 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3363 // DAG.getPass() returns nullptr when using new pass manager.
3364 // TODO: Use DAG.getMFAM() to access analysis result.
3365 if (DAG.getPass()) {
3366 auto &ArgUsageInfo =
3368 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3369 }
3370 }
3371
3372 // TODO: Unify with private memory register handling. This is complicated by
3373 // the fact that at least in kernels, the input argument is not necessarily
3374 // in the same location as the input.
3375 // clang-format off
3376 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3378 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3379 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3380 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3381 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3382 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3383 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3384 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3385 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3386 };
3387 // clang-format on
3388
3389 for (auto [InputID, Attr] : ImplicitAttrs) {
3390 // If the callee does not use the attribute value, skip copying the value.
3391 if (CLI.CB->hasFnAttr(Attr))
3392 continue;
3393
3394 const auto [OutgoingArg, ArgRC, ArgTy] =
3395 CalleeArgInfo->getPreloadedValue(InputID);
3396 if (!OutgoingArg)
3397 continue;
3398
3399 const auto [IncomingArg, IncomingArgRC, Ty] =
3400 CallerArgInfo.getPreloadedValue(InputID);
3401 assert(IncomingArgRC == ArgRC);
3402
3403 // All special arguments are ints for now.
3404 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3405 SDValue InputReg;
3406
3407 if (IncomingArg) {
3408 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3409 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3410 // The implicit arg ptr is special because it doesn't have a corresponding
3411 // input for kernels, and is computed from the kernarg segment pointer.
3412 InputReg = getImplicitArgPtr(DAG, DL);
3413 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3414 std::optional<uint32_t> Id =
3416 if (Id.has_value()) {
3417 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3418 } else {
3419 InputReg = DAG.getUNDEF(ArgVT);
3420 }
3421 } else {
3422 // We may have proven the input wasn't needed, although the ABI still
3423 // requires it. We just need to allocate the register appropriately.
3424 InputReg = DAG.getUNDEF(ArgVT);
3425 }
3426
3427 if (OutgoingArg->isRegister()) {
3428 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3429 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3430 report_fatal_error("failed to allocate implicit input argument");
3431 } else {
3432 unsigned SpecialArgOffset =
3433 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3434 SDValue ArgStore =
3435 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3436 MemOpChains.push_back(ArgStore);
3437 }
3438 }
3439
3440 // Pack workitem IDs into a single register, or pass them as-is if already
3441 // packed.
3442
3443 auto [OutgoingArg, ArgRC, Ty] =
3445 if (!OutgoingArg)
3446 std::tie(OutgoingArg, ArgRC, Ty) =
3448 if (!OutgoingArg)
3449 std::tie(OutgoingArg, ArgRC, Ty) =
3451 if (!OutgoingArg)
3452 return;
3453
3454 const ArgDescriptor *IncomingArgX = std::get<0>(
3456 const ArgDescriptor *IncomingArgY = std::get<0>(
3458 const ArgDescriptor *IncomingArgZ = std::get<0>(
3460
3461 SDValue InputReg;
3462 SDLoc SL;
3463
3464 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3465 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3466 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3467
3468 // If incoming ids are not packed we need to pack them.
3469 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3470 NeedWorkItemIDX) {
3471 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3472 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3473 } else {
3474 InputReg = DAG.getConstant(0, DL, MVT::i32);
3475 }
3476 }
3477
3478 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3479 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3480 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3481 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3482 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3483 InputReg = InputReg.getNode()
3484 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3485 : Y;
3486 }
3487
3488 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3489 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3490 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3491 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3492 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3493 InputReg = InputReg.getNode()
3494 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3495 : Z;
3496 }
3497
3498 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3499 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3500 // We're in a situation where the outgoing function requires the workitem
3501 // ID, but the calling function does not have it (e.g. a graphics function
3502 // calling a C calling convention function). This is illegal, but we need
3503 // to produce something.
3504 InputReg = DAG.getUNDEF(MVT::i32);
3505 } else {
3506 // Workitem IDs are already packed; any of the present incoming arguments
3507 // will carry all required fields.
3508 ArgDescriptor IncomingArg =
3509 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3510 : IncomingArgY ? *IncomingArgY
3511 : *IncomingArgZ,
3512 ~0u);
3513 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3514 }
3515 }
3516
3517 if (OutgoingArg->isRegister()) {
3518 if (InputReg)
3519 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3520
3521 CCInfo.AllocateReg(OutgoingArg->getRegister());
3522 } else {
3523 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3524 if (InputReg) {
3525 SDValue ArgStore =
3526 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3527 MemOpChains.push_back(ArgStore);
3528 }
3529 }
3530}
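// [Editor's illustration; not part of SIISelLowering.cpp] For calls, the three
// workitem IDs are packed into a single outgoing VGPR exactly as the callee
// will unpack them: X in bits [9:0], Y shifted left by 10, Z shifted left by
// 20, OR'd together. The incoming values are already limited to 10 bits by
// their argument masks; the explicit masking below is only for this standalone
// sketch (the helper name is hypothetical):
static unsigned packOutgoingWorkItemIDsSketch(unsigned X, unsigned Y, unsigned Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}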
3531
3533 return CC == CallingConv::Fast;
3534}
3535
3536/// Return true if we might ever do TCO for calls with this calling convention.
3538 switch (CC) {
3539 case CallingConv::C:
3541 return true;
3542 default:
3543 return canGuaranteeTCO(CC);
3544 }
3545}
3546
3548 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3550 const SmallVectorImpl<SDValue> &OutVals,
3551 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3552 if (AMDGPU::isChainCC(CalleeCC))
3553 return true;
3554
3555 if (!mayTailCallThisCC(CalleeCC))
3556 return false;
3557
3558 // For a divergent call target, we need to do a waterfall loop over the
3559 // possible callees which precludes us from using a simple jump.
3560 if (Callee->isDivergent())
3561 return false;
3562
3564 const Function &CallerF = MF.getFunction();
3565 CallingConv::ID CallerCC = CallerF.getCallingConv();
3567 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3568
3569 // Kernels aren't callable, and don't have a live in return address so it
3570 // doesn't make sense to do a tail call with entry functions.
3571 if (!CallerPreserved)
3572 return false;
3573
3574 bool CCMatch = CallerCC == CalleeCC;
3575
3577 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3578 return true;
3579 return false;
3580 }
3581
3582 // TODO: Can we handle var args?
3583 if (IsVarArg)
3584 return false;
3585
3586 for (const Argument &Arg : CallerF.args()) {
3587 if (Arg.hasByValAttr())
3588 return false;
3589 }
3590
3591 LLVMContext &Ctx = *DAG.getContext();
3592
3593 // Check that the call results are passed in the same way.
3594 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3595 CCAssignFnForCall(CalleeCC, IsVarArg),
3596 CCAssignFnForCall(CallerCC, IsVarArg)))
3597 return false;
3598
3599 // The callee has to preserve all registers the caller needs to preserve.
3600 if (!CCMatch) {
3601 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3602 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3603 return false;
3604 }
3605
3606 // Nothing more to check if the callee is taking no arguments.
3607 if (Outs.empty())
3608 return true;
3609
3611 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3612
3613 // FIXME: We are not allocating special input registers, so we will be
3614 // deciding based on incorrect register assignments.
3615 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3616
3617 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3618 // If the stack arguments for this call do not fit into our own save area then
3619 // the call cannot be made tail.
3620 // TODO: Is this really necessary?
3621 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3622 return false;
3623
3624 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3625 // FIXME: What about inreg arguments that end up passed in memory?
3626 if (!CCVA.isRegLoc())
3627 continue;
3628
3629 // If we are passing an argument in an SGPR, and the value is divergent,
3630 // this call requires a waterfall loop.
3631 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3632 LLVM_DEBUG(
3633 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3634 << printReg(CCVA.getLocReg(), TRI) << '\n');
3635 return false;
3636 }
3637 }
3638
3639 const MachineRegisterInfo &MRI = MF.getRegInfo();
3640 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3641}
3642
3643 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3644 if (!CI->isTailCall())
3645 return false;
3646
3647 const Function *ParentFn = CI->getParent()->getParent();
3648 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3649 return false;
3650 return true;
3651}
3652
3653// The wave scratch offset register is used as the global base pointer.
3654 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3655 SmallVectorImpl<SDValue> &InVals) const {
3656 CallingConv::ID CallConv = CLI.CallConv;
3657 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3658
3659 SelectionDAG &DAG = CLI.DAG;
3660
3661 TargetLowering::ArgListEntry RequestedExec;
3662 if (IsChainCallConv) {
3663 // The last argument should be the value that we need to put in EXEC.
3664 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3665 // don't treat it like the rest of the arguments.
3666 RequestedExec = CLI.Args.back();
3667 assert(RequestedExec.Node && "No node for EXEC");
3668
3669 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3670 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3671
3672 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3673 CLI.Outs.pop_back();
3674 CLI.OutVals.pop_back();
3675
3676 if (RequestedExec.Ty->isIntegerTy(64)) {
3677 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3678 CLI.Outs.pop_back();
3679 CLI.OutVals.pop_back();
3680 }
3681
3682 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3683 "Haven't popped all the pieces of the EXEC mask");
3684 }
3685
3686 const SDLoc &DL = CLI.DL;
3687 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3688 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3689 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3690 SDValue Chain = CLI.Chain;
3691 SDValue Callee = CLI.Callee;
3692 bool &IsTailCall = CLI.IsTailCall;
3693 bool IsVarArg = CLI.IsVarArg;
3694 bool IsSibCall = false;
3695 MachineFunction &MF = DAG.getMachineFunction();
3696
3697 if (Callee.isUndef() || isNullConstant(Callee)) {
3698 if (!CLI.IsTailCall) {
3699 for (ISD::InputArg &Arg : CLI.Ins)
3700 InVals.push_back(DAG.getUNDEF(Arg.VT));
3701 }
3702
3703 return Chain;
3704 }
3705
3706 if (IsVarArg) {
3707 return lowerUnhandledCall(CLI, InVals,
3708 "unsupported call to variadic function ");
3709 }
3710
3711 if (!CLI.CB)
3712 report_fatal_error("unsupported libcall legalization");
3713
3714 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3715 return lowerUnhandledCall(CLI, InVals,
3716 "unsupported required tail call to function ");
3717 }
3718
3719 if (IsTailCall) {
3720 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3721 Outs, OutVals, Ins, DAG);
3722 if (!IsTailCall &&
3723 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3724 report_fatal_error("failed to perform tail call elimination on a call "
3725 "site marked musttail or on llvm.amdgcn.cs.chain");
3726 }
3727
3728 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3729
3730 // A sibling call is one where we're under the usual C ABI and not planning
3731 // to change that but can still do a tail call:
3732 if (!TailCallOpt && IsTailCall)
3733 IsSibCall = true;
3734
3735 if (IsTailCall)
3736 ++NumTailCalls;
3737 }
3738
3739 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3740 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3741 SmallVector<SDValue, 8> MemOpChains;
3742
3743 // Analyze operands of the call, assigning locations to each operand.
3744 SmallVector<CCValAssign, 16> ArgLocs;
3745 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3746 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3747
3748 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3749 // With a fixed ABI, allocate fixed registers before user arguments.
3750 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3751 }
3752
3753 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3754
3755 // Get a count of how many bytes are to be pushed on the stack.
3756 unsigned NumBytes = CCInfo.getStackSize();
3757
3758 if (IsSibCall) {
3759 // Since we're not changing the ABI to make this a tail call, the memory
3760 // operands are already available in the caller's incoming argument space.
3761 NumBytes = 0;
3762 }
3763
3764 // FPDiff is the byte offset of the call's argument area from the callee's.
3765 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3766 // by this amount for a tail call. In a sibling call it must be 0 because the
3767 // caller will deallocate the entire stack and the callee still expects its
3768 // arguments to begin at SP+0. Completely unused for non-tail calls.
3769 int32_t FPDiff = 0;
3770 MachineFrameInfo &MFI = MF.getFrameInfo();
3771 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3772
3773 // Adjust the stack pointer for the new arguments...
3774 // These operations are automatically eliminated by the prolog/epilog pass
3775 if (!IsSibCall)
3776 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3777
3778 if (!IsSibCall || IsChainCallConv) {
3779 if (!Subtarget->enableFlatScratch()) {
3780 SmallVector<SDValue, 4> CopyFromChains;
3781
3782 // In the HSA case, this should be an identity copy.
3783 SDValue ScratchRSrcReg =
3784 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3785 RegsToPass.emplace_back(IsChainCallConv
3786 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3787 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3788 ScratchRSrcReg);
3789 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3790 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3791 }
3792 }
3793
3794 const unsigned NumSpecialInputs = RegsToPass.size();
3795
3796 MVT PtrVT = MVT::i32;
3797
3798 // Walk the register/memloc assignments, inserting copies/loads.
3799 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3800 CCValAssign &VA = ArgLocs[i];
3801 SDValue Arg = OutVals[i];
3802
3803 // Promote the value if needed.
3804 switch (VA.getLocInfo()) {
3805 case CCValAssign::Full:
3806 break;
3807 case CCValAssign::BCvt:
3808 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3809 break;
3810 case CCValAssign::ZExt:
3811 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3812 break;
3813 case CCValAssign::SExt:
3814 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3815 break;
3816 case CCValAssign::AExt:
3817 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3818 break;
3819 case CCValAssign::FPExt:
3820 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3821 break;
3822 default:
3823 llvm_unreachable("Unknown loc info!");
3824 }
3825
3826 if (VA.isRegLoc()) {
3827 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3828 } else {
3829 assert(VA.isMemLoc());
3830
3831 SDValue DstAddr;
3832 MachinePointerInfo DstInfo;
3833
3834 unsigned LocMemOffset = VA.getLocMemOffset();
3835 int32_t Offset = LocMemOffset;
3836
3837 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3838 MaybeAlign Alignment;
3839
3840 if (IsTailCall) {
3841 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3842 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3843 : VA.getValVT().getStoreSize();
3844
3845 // FIXME: We can have better than the minimum byval required alignment.
3846 Alignment =
3847 Flags.isByVal()
3848 ? Flags.getNonZeroByValAlign()
3849 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3850
3851 Offset = Offset + FPDiff;
3852 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3853
3854 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3855 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3856
3857 // Make sure any stack arguments overlapping with where we're storing
3858 // are loaded before this eventual operation. Otherwise they'll be
3859 // clobbered.
3860
3861 // FIXME: Why is this really necessary? This seems to just result in a
3862 // lot of code to copy the stack and write them back to the same
3863 // locations, which are supposed to be immutable?
3864 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3865 } else {
3866 // Stores to the argument stack area are relative to the stack pointer.
3867 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3868 MVT::i32);
3869 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3870 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3871 Alignment =
3872 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3873 }
3874
3875 if (Outs[i].Flags.isByVal()) {
3876 SDValue SizeNode =
3877 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3878 SDValue Cpy =
3879 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3880 Outs[i].Flags.getNonZeroByValAlign(),
3881 /*isVol = */ false, /*AlwaysInline = */ true,
3882 /*CI=*/nullptr, std::nullopt, DstInfo,
3883 MachinePointerInfo::getStack(MF, LocMemOffset));
3884
3885 MemOpChains.push_back(Cpy);
3886 } else {
3887 SDValue Store =
3888 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3889 MemOpChains.push_back(Store);
3890 }
3891 }
3892 }
3893
3894 if (!MemOpChains.empty())
3895 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3896
3897 SDValue ReadFirstLaneID =
3898 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3899
3900 SDValue TokenGlue;
3901 if (CLI.ConvergenceControlToken) {
3902 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3903 CLI.ConvergenceControlToken);
3904 }
3905
3906 // Build a sequence of copy-to-reg nodes chained together with token chain
3907 // and flag operands which copy the outgoing args into the appropriate regs.
3908 SDValue InGlue;
3909
3910 unsigned ArgIdx = 0;
3911 for (auto [Reg, Val] : RegsToPass) {
3912 if (ArgIdx++ >= NumSpecialInputs &&
3913 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3914 // For chain calls, the inreg arguments are required to be
3915 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3916 // they are uniform.
3917 //
3918 // For other calls, if an inreg argument is known to be uniform,
3919 // speculatively insert a readfirstlane in case it is in a VGPR.
3920 //
3921 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3922 // value, so let that continue to produce invalid code.
3923
3924 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3925 if (TokenGlue)
3926 ReadfirstlaneArgs.push_back(TokenGlue);
3927 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3928 ReadfirstlaneArgs);
3929 }
3930
3931 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3932 InGlue = Chain.getValue(1);
3933 }
3934
3935 // We don't usually want to end the call-sequence here because we would tidy
3936 // the frame up *after* the call, however in the ABI-changing tail-call case
3937 // we've carefully laid out the parameters so that when sp is reset they'll be
3938 // in the correct location.
3939 if (IsTailCall && !IsSibCall) {
3940 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3941 InGlue = Chain.getValue(1);
3942 }
3943
3944 std::vector<SDValue> Ops({Chain});
3945
3946 // Add a redundant copy of the callee global which will not be legalized, as
3947 // we need direct access to the callee later.
3948 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3949 const GlobalValue *GV = GSD->getGlobal();
3950 Ops.push_back(Callee);
3951 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3952 } else {
3953 if (IsTailCall) {
3954 // isEligibleForTailCallOptimization considered whether the call target is
3955 // divergent, but we may still end up with a uniform value in a VGPR.
3956 // Insert a readfirstlane just in case.
3957 SDValue ReadFirstLaneID =
3958 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3959
3960 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3961 if (TokenGlue)
3962 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3963 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3964 ReadfirstlaneArgs);
3965 }
3966
3967 Ops.push_back(Callee);
3968 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3969 }
3970
3971 if (IsTailCall) {
3972 // Each tail call may have to adjust the stack by a different amount, so
3973 // this information must travel along with the operation for eventual
3974 // consumption by emitEpilogue.
3975 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3976 }
3977
3978 if (IsChainCallConv)
3979 Ops.push_back(RequestedExec.Node);
3980
3981 // Add argument registers to the end of the list so that they are known live
3982 // into the call.
3983 for (auto &[Reg, Val] : RegsToPass)
3984 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3985
3986 // Add a register mask operand representing the call-preserved registers.
3987 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3988 assert(Mask && "Missing call preserved mask for calling convention");
3989 Ops.push_back(DAG.getRegisterMask(Mask));
3990
3991 if (SDValue Token = CLI.ConvergenceControlToken) {
3992 SmallVector<SDValue, 2> GlueOps;
3993 GlueOps.push_back(Token);
3994 if (InGlue)
3995 GlueOps.push_back(InGlue);
3996
3997 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3998 MVT::Glue, GlueOps),
3999 0);
4000 }
4001
4002 if (InGlue)
4003 Ops.push_back(InGlue);
4004
4005 // If we're doing a tail call, use a TC_RETURN here rather than an
4006 // actual call instruction.
4007 if (IsTailCall) {
4008 MFI.setHasTailCall();
4009 unsigned OPC = AMDGPUISD::TC_RETURN;
4010 switch (CallConv) {
4011 case CallingConv::AMDGPU_Gfx:
4012 OPC = AMDGPUISD::TC_RETURN_GFX;
4013 break;
4014 case CallingConv::AMDGPU_CS_Chain:
4015 case CallingConv::AMDGPU_CS_ChainPreserve:
4016 OPC = AMDGPUISD::TC_RETURN_CHAIN;
4017 break;
4018 }
4019
4020 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4021 }
4022
4023 // Returns a chain and a flag for retval copy to use.
4024 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4025 Chain = Call.getValue(0);
4026 InGlue = Call.getValue(1);
4027
4028 uint64_t CalleePopBytes = NumBytes;
4029 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4030 if (!Ins.empty())
4031 InGlue = Chain.getValue(1);
4032
4033 // Handle result values, copying them out of physregs into vregs that we
4034 // return.
4035 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4036 InVals, /*IsThisReturn=*/false, SDValue());
4037}
4038
4039// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4040// except for:
4041// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4042// 2. Scaled size, where scale = wave-reduction(alloca-size) * wave-size.
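// (As a rough illustration, assuming a wave64 target: if every lane of a fully
// active wave allocates 16 bytes, the wave as a whole must advance the stack
// pointer by 16 << 6 = 1024 bytes, which is what the shift by
// getWavefrontSizeLog2() below implements.)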
4044 SelectionDAG &DAG) const {
4045 const MachineFunction &MF = DAG.getMachineFunction();
4046 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4047
4048 SDLoc dl(Op);
4049 EVT VT = Op.getValueType();
4050 SDValue Chain = Op.getOperand(0);
4051 Register SPReg = Info->getStackPtrOffsetReg();
4052
4053 // Chain the dynamic stack allocation so that it doesn't modify the stack
4054 // pointer when other instructions are using the stack.
4055 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4056
4057 SDValue Size = Op.getOperand(1);
4058 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4059 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4060
4061 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4063 "Stack grows upwards for AMDGPU");
4064
4065 Chain = BaseAddr.getValue(1);
4066 Align StackAlign = TFL->getStackAlign();
4067 if (Alignment > StackAlign) {
4068 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4069 << Subtarget->getWavefrontSizeLog2();
4070 uint64_t StackAlignMask = ScaledAlignment - 1;
4071 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4072 DAG.getConstant(StackAlignMask, dl, VT));
4073 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4074 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4075 }
4076
4077 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4078 SDValue NewSP;
4079 if (isa<ConstantSDNode>(Size)) {
4080 // For constant sized alloca, scale alloca size by wave-size
4081 SDValue ScaledSize = DAG.getNode(
4082 ISD::SHL, dl, VT, Size,
4083 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4084 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4085 } else {
4086 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4087 // max of the alloca size (divergent) and then scale it by the wave size.
4088 SDValue WaveReduction =
4089 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4090 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4091 Size, DAG.getConstant(0, dl, MVT::i32));
4092 SDValue ScaledSize = DAG.getNode(
4093 ISD::SHL, dl, VT, Size,
4094 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4095 NewSP =
4096 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4097 SDValue ReadFirstLaneID =
4098 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4099 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4100 NewSP);
4101 }
4102
4103 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4104 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4105
4106 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4107}
4108
4110 if (Op.getValueType() != MVT::i32)
4111 return Op; // Defer to cannot select error.
4112
4114 SDLoc SL(Op);
4115
4116 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4117
4118 // Convert from wave uniform to swizzled vector address. This should protect
4119 // from any edge cases where the stacksave result isn't directly used with
4120 // stackrestore.
4121 SDValue VectorAddress =
4122 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4123 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4124}
4125
4126 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4127 SelectionDAG &DAG) const {
4128 SDLoc SL(Op);
4129 assert(Op.getValueType() == MVT::i32);
4130
4131 uint32_t BothRoundHwReg =
4132 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4133 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4134
4135 SDValue IntrinID =
4136 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4137 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4138 Op.getOperand(0), IntrinID, GetRoundBothImm);
4139
4140 // There are two rounding modes, one for f32 and one for f64/f16. We only
4141 // report in the standard value range if both are the same.
4142 //
4143 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4144 // ties away from zero is not supported, and the other values are rotated by
4145 // 1.
4146 //
4147 // If the two rounding modes are not the same, report a target defined value.
4148
4149 // Mode register rounding mode fields:
4150 //
4151 // [1:0] Single-precision round mode.
4152 // [3:2] Double/Half-precision round mode.
4153 //
4154 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4155 //
4156 //                 Hardware   Spec
4157 //   Toward-0         3         0
4158 //   Nearest Even     0         1
4159 //   +Inf             1         2
4160 //   -Inf             2         3
4161 //   NearestAway0    N/A        4
4162 //
4163 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4164 // table we can index by the raw hardware mode.
4165 //
4166 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4167
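// Rough worked example of the lookup below: if MODE.fp_round is 0 (both f32
// and f64/f16 set to round-to-nearest-even), the shift amount is 0 and the low
// 4-bit entry of the table must hold the FLT_ROUNDS value 1 (to-nearest),
// matching the hardware/spec mapping above.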
4168 SDValue BitTable =
4169 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4170
4171 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4172 SDValue RoundModeTimesNumBits =
4173 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4174
4175 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4176 // knew only one mode was demanded.
4177 SDValue TableValue =
4178 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4179 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4180
4181 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4182 SDValue TableEntry =
4183 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4184
4185 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4186 // if it's an extended value.
4187 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4188 SDValue IsStandardValue =
4189 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4190 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4191 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4192 TableEntry, EnumOffset);
4193
4194 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4195}
4196
4197 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4198 SelectionDAG &DAG) const {
4199 SDLoc SL(Op);
4200
4201 SDValue NewMode = Op.getOperand(1);
4202 assert(NewMode.getValueType() == MVT::i32);
4203
4204 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4205 // hardware MODE.fp_round values.
4206 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4207 uint32_t ClampedVal = std::min(
4208 static_cast<uint32_t>(ConstMode->getZExtValue()),
4210 NewMode = DAG.getConstant(
4211 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4212 } else {
4213 // If we know the input can only be one of the supported standard modes in
4214 // the range 0-3, we can use a simplified mapping to hardware values.
4215 KnownBits KB = DAG.computeKnownBits(NewMode);
4216 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4217 // The supported standard values are 0-3. The extended values start at 8. We
4218 // need to offset by 4 if the value is in the extended range.
4219
4220 if (UseReducedTable) {
4221 // Truncate to the low 32-bits.
4222 SDValue BitTable = DAG.getConstant(
4223 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4224
4225 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4226 SDValue RoundModeTimesNumBits =
4227 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4228
4229 NewMode =
4230 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4231
4232 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4233 // the table extracted bits into inline immediates.
4234 } else {
4235 // table_index = umin(value, value - 4)
4236 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
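// Rough worked example: a standard FLT_ROUNDS value such as 2 (+Inf) gives
// umin(2, 2 - 4) = 2 because the subtraction wraps to a huge unsigned value,
// while an extended value such as 8 gives umin(8, 8 - 4) = 4, so table
// entries 0-3 hold the standard modes and entries 4 and up hold the
// extended ones.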
4237 SDValue BitTable =
4238 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4239
4240 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4241 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4242 SDValue IndexVal =
4243 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4244
4245 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4246 SDValue RoundModeTimesNumBits =
4247 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4248
4249 SDValue TableValue =
4250 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4251 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4252
4253 // No need to mask out the high bits since the setreg will ignore them
4254 // anyway.
4255 NewMode = TruncTable;
4256 }
4257
4258 // Insert a readfirstlane in case the value is a VGPR. We could do this
4259 // earlier and keep more operations scalar, but that interferes with
4260 // combining the source.
4261 SDValue ReadFirstLaneID =
4262 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4263 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4264 ReadFirstLaneID, NewMode);
4265 }
4266
4267 // N.B. The setreg will be later folded into s_round_mode on supported
4268 // targets.
4269 SDValue IntrinID =
4270 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4271 uint32_t BothRoundHwReg =
4272 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4273 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4274
4275 SDValue SetReg =
4276 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4277 IntrinID, RoundBothImm, NewMode);
4278
4279 return SetReg;
4280}
4281
4283 if (Op->isDivergent())
4284 return SDValue();
4285
4286 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4287 case AMDGPUAS::FLAT_ADDRESS:
4288 case AMDGPUAS::GLOBAL_ADDRESS:
4289 case AMDGPUAS::CONSTANT_ADDRESS:
4290 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4291 break;
4292 default:
4293 return SDValue();
4294 }
4295
4296 return Op;
4297}
4298
4299// Work around DAG legality rules only based on the result type.
4300 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4301 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4302 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4303 EVT SrcVT = Src.getValueType();
4304
4305 if (SrcVT.getScalarType() != MVT::bf16)
4306 return Op;
4307
4308 SDLoc SL(Op);
4309 SDValue BitCast =
4310 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4311
4312 EVT DstVT = Op.getValueType();
4313 if (IsStrict)
4314 llvm_unreachable("Need STRICT_BF16_TO_FP");
4315
4316 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4317}
4318
4320 SDLoc SL(Op);
4321 if (Op.getValueType() != MVT::i64)
4322 return Op;
4323
4324 uint32_t ModeHwReg =
4326 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4327 uint32_t TrapHwReg =
4329 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4330
4331 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4332 SDValue IntrinID =
4333 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4334 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4335 Op.getOperand(0), IntrinID, ModeHwRegImm);
4336 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4337 Op.getOperand(0), IntrinID, TrapHwRegImm);
4338 SDValue TokenReg =
4339 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4340 GetTrapReg.getValue(1));
4341
4342 SDValue CvtPtr =
4343 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4344 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4345
4346 return DAG.getMergeValues({Result, TokenReg}, SL);
4347}
4348
4350 SDLoc SL(Op);
4351 if (Op.getOperand(1).getValueType() != MVT::i64)
4352 return Op;
4353
4354 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4355 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4356 DAG.getConstant(0, SL, MVT::i32));
4357 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4358 DAG.getConstant(1, SL, MVT::i32));
4359
4360 SDValue ReadFirstLaneID =
4361 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4362 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4363 ReadFirstLaneID, NewModeReg);
4364 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4365 ReadFirstLaneID, NewTrapReg);
4366
4367 unsigned ModeHwReg =
4369 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4370 unsigned TrapHwReg =
4372 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4373
4374 SDValue IntrinID =
4375 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4376 SDValue SetModeReg =
4377 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4378 IntrinID, ModeHwRegImm, NewModeReg);
4379 SDValue SetTrapReg =
4380 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4381 IntrinID, TrapHwRegImm, NewTrapReg);
4382 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4383}
4384
4385 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4386 const MachineFunction &MF) const {
4387 Register Reg = StringSwitch<Register>(RegName)
4388 .Case("m0", AMDGPU::M0)
4389 .Case("exec", AMDGPU::EXEC)
4390 .Case("exec_lo", AMDGPU::EXEC_LO)
4391 .Case("exec_hi", AMDGPU::EXEC_HI)
4392 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4393 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4394 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4395 .Default(Register());
4396
4397 if (Reg == AMDGPU::NoRegister) {
4399 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4400 }
4401
4402 if (!Subtarget->hasFlatScrRegister() &&
4403 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4404 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4405 "\" for subtarget."));
4406 }
4407
4408 switch (Reg) {
4409 case AMDGPU::M0:
4410 case AMDGPU::EXEC_LO:
4411 case AMDGPU::EXEC_HI:
4412 case AMDGPU::FLAT_SCR_LO:
4413 case AMDGPU::FLAT_SCR_HI:
4414 if (VT.getSizeInBits() == 32)
4415 return Reg;
4416 break;
4417 case AMDGPU::EXEC:
4418 case AMDGPU::FLAT_SCR:
4419 if (VT.getSizeInBits() == 64)
4420 return Reg;
4421 break;
4422 default:
4423 llvm_unreachable("missing register type checking");
4424 }
4425
4427 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4428}
4429
4430// If kill is not the last instruction, split the block so kill is always a
4431// proper terminator.
4432 MachineBasicBlock *
4433 SITargetLowering::splitKillBlock(MachineInstr &MI,
4434 MachineBasicBlock *BB) const {
4435 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4436 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4437 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4438 return SplitBB;
4439}
4440
4441// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4442// \p MI will be the only instruction in the loop body block. Otherwise, it will
4443// be the first instruction in the remainder block.
4444//
4445/// \returns { LoopBody, Remainder }
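// The resulting control flow is roughly:
//
//   MBB -> LoopBB -> RemainderBB
//          LoopBB -> LoopBB     (backedge taken until the loop terminates)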
4446static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4450
4451 // To insert the loop we need to split the block. Move everything after this
4452 // point to a new block, and insert a new empty block between the two.
4454 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4456 ++MBBI;
4457
4458 MF->insert(MBBI, LoopBB);
4459 MF->insert(MBBI, RemainderBB);
4460
4461 LoopBB->addSuccessor(LoopBB);
4462 LoopBB->addSuccessor(RemainderBB);
4463
4464 // Move the rest of the block into a new block.
4465 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4466
4467 if (InstInLoop) {
4468 auto Next = std::next(I);
4469
4470 // Move instruction to loop body.
4471 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4472
4473 // Move the rest of the block.
4474 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4475 } else {
4476 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4477 }
4478
4479 MBB.addSuccessor(LoopBB);
4480
4481 return std::pair(LoopBB, RemainderBB);
4482}
4483
4484/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4486 MachineBasicBlock *MBB = MI.getParent();
4488 auto I = MI.getIterator();
4489 auto E = std::next(I);
4490
4491 // clang-format off
4492 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4493 .addImm(0);
4494 // clang-format on
4495
4496 MIBundleBuilder Bundler(*MBB, I, E);
4497 finalizeBundle(*MBB, Bundler.begin());
4498}
4499
4502 MachineBasicBlock *BB) const {
4503 const DebugLoc &DL = MI.getDebugLoc();
4504
4506
4508
4509 // Apparently kill flags are only valid if the def is in the same block?
4510 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4511 Src->setIsKill(false);
4512
4513 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4514
4515 MachineBasicBlock::iterator I = LoopBB->end();
4516
4517 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4518 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4519
4520 // Clear TRAP_STS.MEM_VIOL
4521 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4522 .addImm(0)
4523 .addImm(EncodedReg);
4524
4526
4527 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4528
4529 // Load and check TRAP_STS.MEM_VIOL
4530 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4531 .addImm(EncodedReg);
4532
4533 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4534 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4535 .addReg(Reg, RegState::Kill)
4536 .addImm(0);
4537 // clang-format off
4538 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4539 .addMBB(LoopBB);
4540 // clang-format on
4541
4542 return RemainderBB;
4543}
4544
4545// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4546// wavefront. If the value is uniform and just happens to be in a VGPR, this
4547// will only do one iteration. In the worst case, this will loop 64 times.
4548//
4549// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
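// Roughly, the emitted waterfall loop looks like:
//
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx      ; pick one lane's index value
//     Cond       = V_CMP_EQ_U32 CurrentIdx, Idx ; lanes holding that same value
//     S_AND_SAVEEXEC Cond                       ; restrict EXEC to those lanes
//     ... use CurrentIdx (via M0 or the GPR index mode) ...
//     EXEC = EXEC ^ NewExec                     ; retire the handled lanes
//     S_CBRANCH_EXECNZ loop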
4552 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4553 const DebugLoc &DL, const MachineOperand &Idx,
4554 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4555 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4556 Register &SGPRIdxReg) {
4557
4558 MachineFunction *MF = OrigBB.getParent();
4559 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4560 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4562
4563 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4564 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4565 Register NewExec = MRI.createVirtualRegister(BoolRC);
4566 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4567 Register CondReg = MRI.createVirtualRegister(BoolRC);
4568
4569 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4570 .addReg(InitReg)
4571 .addMBB(&OrigBB)
4572 .addReg(ResultReg)
4573 .addMBB(&LoopBB);
4574
4575 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4576 .addReg(InitSaveExecReg)
4577 .addMBB(&OrigBB)
4578 .addReg(NewExec)
4579 .addMBB(&LoopBB);
4580
4581 // Read the next variant <- also loop target.
4582 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4583 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4584
4585 // Compare the just read M0 value to all possible Idx values.
4586 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4587 .addReg(CurrentIdxReg)
4588 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4589
4590 // Update EXEC, save the original EXEC value to VCC.
4591 BuildMI(LoopBB, I, DL,
4592 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4593 : AMDGPU::S_AND_SAVEEXEC_B64),
4594 NewExec)
4595 .addReg(CondReg, RegState::Kill);
4596
4597 MRI.setSimpleHint(NewExec, CondReg);
4598
4599 if (UseGPRIdxMode) {
4600 if (Offset == 0) {
4601 SGPRIdxReg = CurrentIdxReg;
4602 } else {
4603 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4604 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4605 .addReg(CurrentIdxReg, RegState::Kill)
4606 .addImm(Offset);
4607 }
4608 } else {
4609 // Move index from VCC into M0
4610 if (Offset == 0) {
4611 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4612 .addReg(CurrentIdxReg, RegState::Kill);
4613 } else {
4614 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4615 .addReg(CurrentIdxReg, RegState::Kill)
4616 .addImm(Offset);
4617 }
4618 }
4619
4620 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4621 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4622 MachineInstr *InsertPt =
4623 BuildMI(LoopBB, I, DL,
4624 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4625 : AMDGPU::S_XOR_B64_term),
4626 Exec)
4627 .addReg(Exec)
4628 .addReg(NewExec);
4629
4630 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4631 // s_cbranch_scc0?
4632
4633 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4634 // clang-format off
4635 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4636 .addMBB(&LoopBB);
4637 // clang-format on
4638
4639 return InsertPt->getIterator();
4640}
4641
4642// This has slightly sub-optimal regalloc when the source vector is killed by
4643// the read. The register allocator does not understand that the kill is
4644// per-workitem, so the value is kept alive for the whole loop and we end up not
4645// re-using a subregister from it, using 1 more VGPR than necessary. This VGPR
4646// was saved when this was expanded after register allocation.
4649 unsigned InitResultReg, unsigned PhiReg, int Offset,
4650 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4652 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4653 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4655 const DebugLoc &DL = MI.getDebugLoc();
4657
4658 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4659 Register DstReg = MI.getOperand(0).getReg();
4660 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4661 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4662 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4663 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4664
4665 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4666
4667 // Save the EXEC mask
4668 // clang-format off
4669 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4670 .addReg(Exec);
4671 // clang-format on
4672
4673 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4674
4675 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4676
4677 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4678 InitResultReg, DstReg, PhiReg, TmpExec,
4679 Offset, UseGPRIdxMode, SGPRIdxReg);
4680
4681 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4683 ++MBBI;
4684 MF->insert(MBBI, LandingPad);
4685 LoopBB->removeSuccessor(RemainderBB);
4686 LandingPad->addSuccessor(RemainderBB);
4687 LoopBB->addSuccessor(LandingPad);
4688 MachineBasicBlock::iterator First = LandingPad->begin();
4689 // clang-format off
4690 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4691 .addReg(SaveExec);
4692 // clang-format on
4693
4694 return InsPt;
4695}
4696
4697// Returns subreg index, offset
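// For example, for a 128-bit (4 x 32-bit) vector: a constant offset of 2 folds
// into the subregister as (sub2, 0), while an out-of-bounds offset such as 5 is
// returned unchanged as (sub0, 5) and left to the dynamic indexing path.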
4698static std::pair<unsigned, int>
4700 const TargetRegisterClass *SuperRC, unsigned VecReg,
4701 int Offset) {
4702 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4703
4704 // Skip out of bounds offsets, or else we would end up using an undefined
4705 // register.
4706 if (Offset >= NumElts || Offset < 0)
4707 return std::pair(AMDGPU::sub0, Offset);
4708
4709 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4710}
4711
4714 int Offset) {
4715 MachineBasicBlock *MBB = MI.getParent();
4716 const DebugLoc &DL = MI.getDebugLoc();
4718
4719 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4720
4721 assert(Idx->getReg() != AMDGPU::NoRegister);
4722
4723 if (Offset == 0) {
4724 // clang-format off
4725 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4726 .add(*Idx);
4727 // clang-format on
4728 } else {
4729 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4730 .add(*Idx)
4731 .addImm(Offset);
4732 }
4733}
4734
4737 int Offset) {
4738 MachineBasicBlock *MBB = MI.getParent();
4739 const DebugLoc &DL = MI.getDebugLoc();
4741
4742 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4743
4744 if (Offset == 0)
4745 return Idx->getReg();
4746
4747 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4748 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4749 .add(*Idx)
4750 .addImm(Offset);
4751 return Tmp;
4752}
4753
4756 const GCNSubtarget &ST) {
4757 const SIInstrInfo *TII = ST.getInstrInfo();
4758 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4761
4762 Register Dst = MI.getOperand(0).getReg();
4763 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4764 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4765 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4766
4767 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4768 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4769
4770 unsigned SubReg;
4771 std::tie(SubReg, Offset) =
4772 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4773
4774 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4775
4776 // Check for a SGPR index.
4777 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4779 const DebugLoc &DL = MI.getDebugLoc();
4780
4781 if (UseGPRIdxMode) {
4782 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4783 // to avoid interfering with other uses, so probably requires a new
4784 // optimization pass.
4786
4787 const MCInstrDesc &GPRIDXDesc =
4788 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4789 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4790 .addReg(SrcReg)
4791 .addReg(Idx)
4792 .addImm(SubReg);
4793 } else {
4795
4796 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4797 .addReg(SrcReg, 0, SubReg)
4798 .addReg(SrcReg, RegState::Implicit);
4799 }
4800
4801 MI.eraseFromParent();
4802
4803 return &MBB;
4804 }
4805
4806 // Control flow needs to be inserted if indexing with a VGPR.
4807 const DebugLoc &DL = MI.getDebugLoc();
4809
4810 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4811 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4812
4813 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4814
4815 Register SGPRIdxReg;
4816 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4817 UseGPRIdxMode, SGPRIdxReg);
4818
4819 MachineBasicBlock *LoopBB = InsPt->getParent();
4820
4821 if (UseGPRIdxMode) {
4822 const MCInstrDesc &GPRIDXDesc =
4823 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4824
4825 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4826 .addReg(SrcReg)
4827 .addReg(SGPRIdxReg)
4828 .addImm(SubReg);
4829 } else {
4830 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4831 .addReg(SrcReg, 0, SubReg)
4832 .addReg(SrcReg, RegState::Implicit);
4833 }
4834
4835 MI.eraseFromParent();
4836
4837 return LoopBB;
4838}
4839
4842 const GCNSubtarget &ST) {
4843 const SIInstrInfo *TII = ST.getInstrInfo();
4844 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4847
4848 Register Dst = MI.getOperand(0).getReg();
4849 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4850 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4851 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4852 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4853 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4854 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4855
4856 // This can be an immediate, but will be folded later.
4857 assert(Val->getReg());
4858
4859 unsigned SubReg;
4860 std::tie(SubReg, Offset) =
4861 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4862 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4863
4864 if (Idx->getReg() == AMDGPU::NoRegister) {
4866 const DebugLoc &DL = MI.getDebugLoc();
4867
4868 assert(Offset == 0);
4869
4870 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4871 .add(*SrcVec)
4872 .add(*Val)
4873 .addImm(SubReg);
4874
4875 MI.eraseFromParent();
4876 return &MBB;
4877 }
4878
4879 // Check for a SGPR index.
4880 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4882 const DebugLoc &DL = MI.getDebugLoc();
4883
4884 if (UseGPRIdxMode) {
4886
4887 const MCInstrDesc &GPRIDXDesc =
4888 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4889 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4890 .addReg(SrcVec->getReg())
4891 .add(*Val)
4892 .addReg(Idx)
4893 .addImm(SubReg);
4894 } else {
4896
4897 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4898 TRI.getRegSizeInBits(*VecRC), 32, false);
4899 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4900 .addReg(SrcVec->getReg())
4901 .add(*Val)
4902 .addImm(SubReg);
4903 }
4904 MI.eraseFromParent();
4905 return &MBB;
4906 }
4907
4908 // Control flow needs to be inserted if indexing with a VGPR.
4909 if (Val->isReg())
4910 MRI.clearKillFlags(Val->getReg());
4911
4912 const DebugLoc &DL = MI.getDebugLoc();
4913
4914 Register PhiReg = MRI.createVirtualRegister(VecRC);
4915
4916 Register SGPRIdxReg;
4917 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4918 UseGPRIdxMode, SGPRIdxReg);
4919 MachineBasicBlock *LoopBB = InsPt->getParent();
4920
4921 if (UseGPRIdxMode) {
4922 const MCInstrDesc &GPRIDXDesc =
4923 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4924
4925 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4926 .addReg(PhiReg)
4927 .add(*Val)
4928 .addReg(SGPRIdxReg)
4929 .addImm(SubReg);
4930 } else {
4931 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4932 TRI.getRegSizeInBits(*VecRC), 32, false);
4933 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4934 .addReg(PhiReg)
4935 .add(*Val)
4936 .addImm(SubReg);
4937 }
4938
4939 MI.eraseFromParent();
4940 return LoopBB;
4941}
4942
4945 const GCNSubtarget &ST,
4946 unsigned Opc) {
4948 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4949 const DebugLoc &DL = MI.getDebugLoc();
4950 const SIInstrInfo *TII = ST.getInstrInfo();
4951
4952 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4953 Register SrcReg = MI.getOperand(1).getReg();
4954 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4955 Register DstReg = MI.getOperand(0).getReg();
4956 MachineBasicBlock *RetBB = nullptr;
4957 if (isSGPR) {
4958 // These operations with a uniform value, i.e. an SGPR, are idempotent.
4959 // The reduced value will be the same as the given SGPR.
4960 // clang-format off
4961 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4962 .addReg(SrcReg);
4963 // clang-format on
4964 RetBB = &BB;
4965 } else {
4966 // TODO: Implement DPP Strategy and switch based on immediate strategy
4967 // operand. For now, for all the cases (default, Iterative and DPP) we use
4968 // the iterative approach by default.
4969
4970 // To reduce the VGPR using the iterative approach, we need to iterate
4971 // over all the active lanes. Lowering consists of a ComputeLoop,
4972 // which iterates over only the active lanes. We use a copy of the EXEC
4973 // register as the induction variable, and every active lane clears its bit
4974 // using bitset0 so that we get the next active lane for the next iteration.
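// In pseudocode, the generated loop is roughly:
//
//   acc  = identity (UINT32_MAX for umin, 0 for umax)
//   live = copy of EXEC
//   do {
//     lane = s_ff1(live)                  ; lowest remaining active lane
//     acc  = op(acc, v_readlane(src, lane))
//     live = s_bitset0(live, lane)        ; retire that lane
//   } while (live != 0)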
4976 Register SrcReg = MI.getOperand(1).getReg();
4977
4978 // Create Control flow for loop
4979 // Split MI's Machine Basic block into For loop
4980 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4981
4982 // Create virtual registers required for lowering.
4983 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4984 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4985 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4986 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4987
4988 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4989 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4990 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4991
4992 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4993 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4994
4995 bool IsWave32 = ST.isWave32();
4996 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4997 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4998
4999 // Create initial values of the induction variable (from EXEC) and the
5000 // accumulator, and insert a branch to the newly created ComputeLoop block.
5001 uint32_t InitalValue =
5002 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5003 auto TmpSReg =
5004 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5005 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5006 .addImm(InitalValue);
5007 // clang-format off
5008 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5009 .addMBB(ComputeLoop);
5010 // clang-format on
5011
5012 // Start constructing ComputeLoop
5013 I = ComputeLoop->end();
5014 auto Accumulator =
5015 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5016 .addReg(InitalValReg)
5017 .addMBB(&BB);
5018 auto ActiveBits =
5019 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5020 .addReg(TmpSReg->getOperand(0).getReg())
5021 .addMBB(&BB);
5022
5023 // Perform the computations
5024 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5025 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5026 .addReg(ActiveBits->getOperand(0).getReg());
5027 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5028 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5029 .addReg(SrcReg)
5030 .addReg(FF1->getOperand(0).getReg());
5031 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5032 .addReg(Accumulator->getOperand(0).getReg())
5033 .addReg(LaneValue->getOperand(0).getReg());
5034
5035 // Manipulate the iterator to get the next active lane
5036 unsigned BITSETOpc =
5037 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5038 auto NewActiveBits =
5039 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5040 .addReg(FF1->getOperand(0).getReg())
5041 .addReg(ActiveBits->getOperand(0).getReg());
5042
5043 // Add phi nodes
5044 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5045 .addMBB(ComputeLoop);
5046 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5047 .addMBB(ComputeLoop);
5048
5049 // Create the loop-back branch
5050 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5051 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5052 .addReg(NewActiveBits->getOperand(0).getReg())
5053 .addImm(0);
5054 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5055 .addMBB(ComputeLoop);
5056
5057 RetBB = ComputeEnd;
5058 }
5059 MI.eraseFromParent();
5060 return RetBB;
5061}
5062
5065 MachineBasicBlock *BB) const {
5066
5068 MachineFunction *MF = BB->getParent();
5070
5071 switch (MI.getOpcode()) {
5072 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5073 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5074 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5075 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5076 case AMDGPU::S_UADDO_PSEUDO:
5077 case AMDGPU::S_USUBO_PSEUDO: {
5078 const DebugLoc &DL = MI.getDebugLoc();
5079 MachineOperand &Dest0 = MI.getOperand(0);
5080 MachineOperand &Dest1 = MI.getOperand(1);
5081 MachineOperand &Src0 = MI.getOperand(2);
5082 MachineOperand &Src1 = MI.getOperand(3);
5083
5084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5085 ? AMDGPU::S_ADD_I32
5086 : AMDGPU::S_SUB_I32;
5087 // clang-format off
5088 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5089 .add(Src0)
5090 .add(Src1);
5091 // clang-format on
5092
5093 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5094 .addImm(1)
5095 .addImm(0);
5096
5097 MI.eraseFromParent();
5098 return BB;
5099 }
5100 case AMDGPU::S_ADD_U64_PSEUDO:
5101 case AMDGPU::S_SUB_U64_PSEUDO: {
5102 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5103 // For GFX12, we emit s_add_u64 and s_sub_u64.
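// The pre-GFX12 expansion is the usual 32-bit carry chain, e.g. for an add:
//   lo = S_ADD_U32  src0.lo, src1.lo   ; sets SCC on carry-out
//   hi = S_ADDC_U32 src0.hi, src1.hi   ; consumes SCC as carry-in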
5104 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5106 const DebugLoc &DL = MI.getDebugLoc();
5107 MachineOperand &Dest = MI.getOperand(0);
5108 MachineOperand &Src0 = MI.getOperand(1);
5109 MachineOperand &Src1 = MI.getOperand(2);
5110 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5111 if (Subtarget->hasScalarAddSub64()) {
5112 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5113 // clang-format off
5114 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5115 .add(Src0)
5116 .add(Src1);
5117 // clang-format on
5118 } else {
5119 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5120 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5121
5122 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5123 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5124
5125 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5126 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5127 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5128 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5129
5130 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5131 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5132 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5133 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5134
5135 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5136 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5137 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5138 .add(Src0Sub0)
5139 .add(Src1Sub0);
5140 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5141 .add(Src0Sub1)
5142 .add(Src1Sub1);
5143 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5144 .addReg(DestSub0)
5145 .addImm(AMDGPU::sub0)
5146 .addReg(DestSub1)
5147 .addImm(AMDGPU::sub1);
5148 }
5149 MI.eraseFromParent();
5150 return BB;
5151 }
5152 case AMDGPU::V_ADD_U64_PSEUDO:
5153 case AMDGPU::V_SUB_U64_PSEUDO: {
5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157 const DebugLoc &DL = MI.getDebugLoc();
5158
5159 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5160
5161 MachineOperand &Dest = MI.getOperand(0);
5162 MachineOperand &Src0 = MI.getOperand(1);
5163 MachineOperand &Src1 = MI.getOperand(2);
5164
5165 if (IsAdd && ST.hasLshlAddB64()) {
5166 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5167 Dest.getReg())
5168 .add(Src0)
5169 .addImm(0)
5170 .add(Src1);
5171 TII->legalizeOperands(*Add);
5172 MI.eraseFromParent();
5173 return BB;
5174 }
5175
5176 const auto *CarryRC = TRI->getWaveMaskRegClass();
5177
5178 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5179 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5180
5181 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5182 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5183
5184 const TargetRegisterClass *Src0RC = Src0.isReg()
5185 ? MRI.getRegClass(Src0.getReg())
5186 : &AMDGPU::VReg_64RegClass;
5187 const TargetRegisterClass *Src1RC = Src1.isReg()
5188 ? MRI.getRegClass(Src1.getReg())
5189 : &AMDGPU::VReg_64RegClass;
5190
5191 const TargetRegisterClass *Src0SubRC =
5192 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5193 const TargetRegisterClass *Src1SubRC =
5194 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5195
5196 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5197 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5198 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5199 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5200
5201 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5202 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5203 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5204 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5205
5206 unsigned LoOpc =
5207 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5208 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5209 .addReg(CarryReg, RegState::Define)
5210 .add(SrcReg0Sub0)
5211 .add(SrcReg1Sub0)
5212 .addImm(0); // clamp bit
5213
5214 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5215 MachineInstr *HiHalf =
5216 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5217 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5218 .add(SrcReg0Sub1)
5219 .add(SrcReg1Sub1)
5220 .addReg(CarryReg, RegState::Kill)
5221 .addImm(0); // clamp bit
5222
5223 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5224 .addReg(DestSub0)
5225 .addImm(AMDGPU::sub0)
5226 .addReg(DestSub1)
5227 .addImm(AMDGPU::sub1);
5228 TII->legalizeOperands(*LoHalf);
5229 TII->legalizeOperands(*HiHalf);
5230 MI.eraseFromParent();
5231 return BB;
5232 }
5233 case AMDGPU::S_ADD_CO_PSEUDO:
5234 case AMDGPU::S_SUB_CO_PSEUDO: {
5235 // This pseudo can only be selected from a uniform
5236 // add/subcarry node. All the VGPR operands are
5237 // therefore assumed to be splat vectors.
5239 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5240 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5242 const DebugLoc &DL = MI.getDebugLoc();
5243 MachineOperand &Dest = MI.getOperand(0);
5244 MachineOperand &CarryDest = MI.getOperand(1);
5245 MachineOperand &Src0 = MI.getOperand(2);
5246 MachineOperand &Src1 = MI.getOperand(3);
5247 MachineOperand &Src2 = MI.getOperand(4);
5248 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5249 ? AMDGPU::S_ADDC_U32
5250 : AMDGPU::S_SUBB_U32;
5251 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5252 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5253 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5254 .addReg(Src0.getReg());
5255 Src0.setReg(RegOp0);
5256 }
5257 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5258 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5260 .addReg(Src1.getReg());
5261 Src1.setReg(RegOp1);
5262 }
5263 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5264 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5265 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5266 .addReg(Src2.getReg());
5267 Src2.setReg(RegOp2);
5268 }
5269
5270 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5271 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5272 assert(WaveSize == 64 || WaveSize == 32);
5273
5274 if (WaveSize == 64) {
5275 if (ST.hasScalarCompareEq64()) {
5276 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5277 .addReg(Src2.getReg())
5278 .addImm(0);
5279 } else {
5280 const TargetRegisterClass *SubRC =
5281 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5282 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5283 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5284 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5285 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5286 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5287
5288 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5289 .add(Src2Sub0)
5290 .add(Src2Sub1);
5291
5292 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5293 .addReg(Src2_32, RegState::Kill)
5294 .addImm(0);
5295 }
5296 } else {
5297 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5298 .addReg(Src2.getReg())
5299 .addImm(0);
5300 }
5301
5302 // clang-format off
5303 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5304 .add(Src0)
5305 .add(Src1);
5306 // clang-format on
5307
5308 unsigned SelOpc =
5309 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5310
5311 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5312 .addImm(-1)
5313 .addImm(0);
5314
5315 MI.eraseFromParent();
5316 return BB;
5317 }
5318 case AMDGPU::SI_INIT_M0: {
5319 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5320 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5321 .add(MI.getOperand(0));
5322 MI.eraseFromParent();
5323 return BB;
5324 }
5325 case AMDGPU::GET_GROUPSTATICSIZE: {
5326 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5327 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5328 DebugLoc DL = MI.getDebugLoc();
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5330 .add(MI.getOperand(0))
5331 .addImm(MFI->getLDSSize());
5332 MI.eraseFromParent();
5333 return BB;
5334 }
5335 case AMDGPU::GET_SHADERCYCLESHILO: {
5338 const DebugLoc &DL = MI.getDebugLoc();
5339 // The algorithm is:
5340 //
5341 // hi1 = getreg(SHADER_CYCLES_HI)
5342 // lo1 = getreg(SHADER_CYCLES_LO)
5343 // hi2 = getreg(SHADER_CYCLES_HI)
5344 //
5345 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5346 // Otherwise there was overflow and the result is hi2:0. In both cases the
5347 // result should represent the actual time at some point during the sequence
5348 // of three getregs.
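// Illustrative sketch (not verbatim code) of what the getreg/select sequence
// built below computes:
//   lo     = (hi1 == hi2) ? lo1 : 0;      // S_CMP_EQ_U32 + S_CSELECT_B32
//   result = ((uint64_t)hi2 << 32) | lo;  // REG_SEQUENCE sub0/sub1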
5349 using namespace AMDGPU::Hwreg;
5350 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5352 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5353 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5354 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5355 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5356 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5358 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5359 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5360 .addReg(RegHi1)
5361 .addReg(RegHi2);
5362 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5364 .addReg(RegLo1)
5365 .addImm(0);
5366 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5367 .add(MI.getOperand(0))
5368 .addReg(RegLo)
5369 .addImm(AMDGPU::sub0)
5370 .addReg(RegHi2)
5371 .addImm(AMDGPU::sub1);
5372 MI.eraseFromParent();
5373 return BB;
5374 }
5375 case AMDGPU::SI_INDIRECT_SRC_V1:
5376 case AMDGPU::SI_INDIRECT_SRC_V2:
5377 case AMDGPU::SI_INDIRECT_SRC_V4:
5378 case AMDGPU::SI_INDIRECT_SRC_V8:
5379 case AMDGPU::SI_INDIRECT_SRC_V9:
5380 case AMDGPU::SI_INDIRECT_SRC_V10:
5381 case AMDGPU::SI_INDIRECT_SRC_V11:
5382 case AMDGPU::SI_INDIRECT_SRC_V12:
5383 case AMDGPU::SI_INDIRECT_SRC_V16:
5384 case AMDGPU::SI_INDIRECT_SRC_V32:
5385 return emitIndirectSrc(MI, *BB, *getSubtarget());
5386 case AMDGPU::SI_INDIRECT_DST_V1:
5387 case AMDGPU::SI_INDIRECT_DST_V2:
5388 case AMDGPU::SI_INDIRECT_DST_V4:
5389 case AMDGPU::SI_INDIRECT_DST_V8:
5390 case AMDGPU::SI_INDIRECT_DST_V9:
5391 case AMDGPU::SI_INDIRECT_DST_V10:
5392 case AMDGPU::SI_INDIRECT_DST_V11:
5393 case AMDGPU::SI_INDIRECT_DST_V12:
5394 case AMDGPU::SI_INDIRECT_DST_V16:
5395 case AMDGPU::SI_INDIRECT_DST_V32:
5396 return emitIndirectDst(MI, *BB, *getSubtarget());
5397 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5398 case AMDGPU::SI_KILL_I1_PSEUDO:
5399 return splitKillBlock(MI, BB);
5400 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5402 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5403 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5404
5405 Register Dst = MI.getOperand(0).getReg();
5406 const MachineOperand &Src0 = MI.getOperand(1);
5407 const MachineOperand &Src1 = MI.getOperand(2);
5408 const DebugLoc &DL = MI.getDebugLoc();
5409 Register SrcCond = MI.getOperand(3).getReg();
5410
5411 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5412 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5413 const auto *CondRC = TRI->getWaveMaskRegClass();
5414 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5415
5416 const TargetRegisterClass *Src0RC = Src0.isReg()
5417 ? MRI.getRegClass(Src0.getReg())
5418 : &AMDGPU::VReg_64RegClass;
5419 const TargetRegisterClass *Src1RC = Src1.isReg()
5420 ? MRI.getRegClass(Src1.getReg())
5421 : &AMDGPU::VReg_64RegClass;
5422
5423 const TargetRegisterClass *Src0SubRC =
5424 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5425 const TargetRegisterClass *Src1SubRC =
5426 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5427
5428 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5429 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5430 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5431 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5432
5433 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5434 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5435 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5436 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5437
5438 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5439 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5440 .addImm(0)
5441 .add(Src0Sub0)
5442 .addImm(0)
5443 .add(Src1Sub0)
5444 .addReg(SrcCondCopy);
5445 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5446 .addImm(0)
5447 .add(Src0Sub1)
5448 .addImm(0)
5449 .add(Src1Sub1)
5450 .addReg(SrcCondCopy);
5451
5452 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5453 .addReg(DstLo)
5454 .addImm(AMDGPU::sub0)
5455 .addReg(DstHi)
5456 .addImm(AMDGPU::sub1);
5457 MI.eraseFromParent();
5458 return BB;
5459 }
5460 case AMDGPU::SI_BR_UNDEF: {
5462 const DebugLoc &DL = MI.getDebugLoc();
5463 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5464 .add(MI.getOperand(0));
5465 Br->getOperand(1).setIsUndef(); // read undef SCC
5466 MI.eraseFromParent();
5467 return BB;
5468 }
5469 case AMDGPU::ADJCALLSTACKUP:
5470 case AMDGPU::ADJCALLSTACKDOWN: {
5472 MachineInstrBuilder MIB(*MF, &MI);
5473 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5474 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5475 return BB;
5476 }
5477 case AMDGPU::SI_CALL_ISEL: {
5479 const DebugLoc &DL = MI.getDebugLoc();
5480
5481 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5482
5484 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5485
5486 for (const MachineOperand &MO : MI.operands())
5487 MIB.add(MO);
5488
5489 MIB.cloneMemRefs(MI);
5490 MI.eraseFromParent();
5491 return BB;
5492 }
5493 case AMDGPU::V_ADD_CO_U32_e32:
5494 case AMDGPU::V_SUB_CO_U32_e32:
5495 case AMDGPU::V_SUBREV_CO_U32_e32: {
5496 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5497 const DebugLoc &DL = MI.getDebugLoc();
5498 unsigned Opc = MI.getOpcode();
5499
5500 bool NeedClampOperand = false;
5501 if (TII->pseudoToMCOpcode(Opc) == -1) {
5502 Opc = AMDGPU::getVOPe64(Opc);
5503 NeedClampOperand = true;
5504 }
5505
5506 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5507 if (TII->isVOP3(*I)) {
5508 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5509 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5510 I.addReg(TRI->getVCC(), RegState::Define);
5511 }
5512 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5513 if (NeedClampOperand)
5514 I.addImm(0); // clamp bit for e64 encoding
5515
5516 TII->legalizeOperands(*I);
5517
5518 MI.eraseFromParent();
5519 return BB;
5520 }
5521 case AMDGPU::V_ADDC_U32_e32:
5522 case AMDGPU::V_SUBB_U32_e32:
5523 case AMDGPU::V_SUBBREV_U32_e32:
5524 // These instructions have an implicit use of vcc which counts towards the
5525 // constant bus limit.
5526 TII->legalizeOperands(MI);
5527 return BB;
5528 case AMDGPU::DS_GWS_INIT:
5529 case AMDGPU::DS_GWS_SEMA_BR:
5530 case AMDGPU::DS_GWS_BARRIER:
5531 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5532 [[fallthrough]];
5533 case AMDGPU::DS_GWS_SEMA_V:
5534 case AMDGPU::DS_GWS_SEMA_P:
5535 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5536 // An s_waitcnt 0 is required to be the instruction immediately following.
5537 if (getSubtarget()->hasGWSAutoReplay()) {
5539 return BB;
5540 }
5541
5542 return emitGWSMemViolTestLoop(MI, BB);
5543 case AMDGPU::S_SETREG_B32: {
5544 // Try to optimize cases that only set the denormal mode or rounding mode.
5545 //
5546 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5547 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5548 // instead.
5549 //
5550 // FIXME: This could be expressed as predicates on the immediate, but
5551 // tablegen doesn't allow you to have a no-side-effect instruction in the
5552 // output of a side-effecting pattern.
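// Illustrative example of the rewrite performed below (operand syntax
// simplified, not verbatim assembly): when the written value comes from a
// move-immediate and the hwreg mask covers exactly the rounding-mode bits,
//   s_mov_b32 sN, <imm>
//   s_setreg_b32 <MODE, rounding bits>, sN
// can instead be emitted as a single
//   s_round_mode (<imm> & 0xf)
// with s_denorm_mode handling the denormal bits analogously.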
5553 auto [ID, Offset, Width] =
5554 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5555 if (ID != AMDGPU::Hwreg::ID_MODE)
5556 return BB;
5557
5558 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5559 const unsigned SetMask = WidthMask << Offset;
5560
5561 if (getSubtarget()->hasDenormModeInst()) {
5562 unsigned SetDenormOp = 0;
5563 unsigned SetRoundOp = 0;
5564
5565 // The dedicated instructions can only set the whole denorm or round mode
5566 // at once, not a subset of bits in either.
5567 if (SetMask ==
5568 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5569 // If this fully sets both the round and denorm mode, emit the two
5570 // dedicated instructions for these.
5571 SetRoundOp = AMDGPU::S_ROUND_MODE;
5572 SetDenormOp = AMDGPU::S_DENORM_MODE;
5573 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5574 SetRoundOp = AMDGPU::S_ROUND_MODE;
5575 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5576 SetDenormOp = AMDGPU::S_DENORM_MODE;
5577 }
5578
5579 if (SetRoundOp || SetDenormOp) {
5580 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5581 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5582 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5583 unsigned ImmVal = Def->getOperand(1).getImm();
5584 if (SetRoundOp) {
5585 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5586 .addImm(ImmVal & 0xf);
5587
5588 // If we also have the denorm mode, get just the denorm mode bits.
5589 ImmVal >>= 4;
5590 }
5591
5592 if (SetDenormOp) {
5593 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5594 .addImm(ImmVal & 0xf);
5595 }
5596
5597 MI.eraseFromParent();
5598 return BB;
5599 }
5600 }
5601 }
5602
5603 // If only FP bits are touched, use the no-side-effects pseudo.
5604 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5605 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5606 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5607
5608 return BB;
5609 }
5610 case AMDGPU::S_INVERSE_BALLOT_U32:
5611 case AMDGPU::S_INVERSE_BALLOT_U64:
5612 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5613 // necessary. After that they are equivalent to a COPY.
5614 MI.setDesc(TII->get(AMDGPU::COPY));
5615 return BB;
5616 case AMDGPU::ENDPGM_TRAP: {
5617 const DebugLoc &DL = MI.getDebugLoc();
5618 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5619 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5620 MI.addOperand(MachineOperand::CreateImm(0));
5621 return BB;
5622 }
5623
5624 // We need a block split to make the real endpgm a terminator. We also don't
5625 // want to break phis in successor blocks, so we can't just delete to the
5626 // end of the block.
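// Resulting control flow (illustrative): BB ends with an S_CBRANCH_EXECNZ to
// a new TrapBB that contains only S_ENDPGM, and otherwise falls through to
// SplitBB, which keeps the instructions that followed this pseudo.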
5627
5628 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5629 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5630 MF->push_back(TrapBB);
5631 // clang-format off
5632 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5633 .addImm(0);
5634 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5635 .addMBB(TrapBB);
5636 // clang-format on
5637
5638 BB->addSuccessor(TrapBB);
5639 MI.eraseFromParent();
5640 return SplitBB;
5641 }
5642 case AMDGPU::SIMULATED_TRAP: {
5643 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5645 MachineBasicBlock *SplitBB =
5646 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5647 MI.eraseFromParent();
5648 return SplitBB;
5649 }
5650 default:
5651 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5652 if (!MI.mayStore())
5654 return BB;
5655 }
5657 }
5658}
5659
5661 // This currently forces unfolding various combinations of fsub into fma with
5662 // free fneg'd operands. As long as we have fast FMA (controlled by
5663 // isFMAFasterThanFMulAndFAdd), we should perform these.
5664
5665 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5666 // most of these combines appear to be cycle neutral but save on instruction
5667 // count / code size.
5668 return true;
5669}
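// For example (illustrative): with fast FMA available, a combine may rewrite
//   (fsub x, (fmul y, z))  -->  (fma (fneg y), z, x)
// since the fneg is free on the multiplicand, trading an fmul + fsub pair for
// a single fma.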
5670
5672
5674 EVT VT) const {
5675 if (!VT.isVector()) {
5676 return MVT::i1;
5677 }
5678 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5679}
5680
5682 // TODO: Should i16 be used always if legal? For now it would force VALU
5683 // shifts.
5684 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5685}
5686
5688 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5689 ? Ty.changeElementSize(16)
5690 : Ty.changeElementSize(32);
5691}
5692
5693 // Answering this is somewhat tricky and depends on the specific device, since
5694 // devices have different rates for fma and for f64 operations in general.
5695//
5696// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5697// regardless of which device (although the number of cycles differs between
5698// devices), so it is always profitable for f64.
5699//
5700// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5701// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5702// which we can always do even without fused FP ops since it returns the same
5703// result as the separate operations and since it is always full
5704// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5705// however does not support denormals, so we do report fma as faster if we have
5706// a fast fma device and require denormals.
5707//
5709 EVT VT) const {
5710 VT = VT.getScalarType();
5711
5712 switch (VT.getSimpleVT().SimpleTy) {
5713 case MVT::f32: {
5714 // If mad is not available this depends only on if f32 fma is full rate.
5715 if (!Subtarget->hasMadMacF32Insts())
5716 return Subtarget->hasFastFMAF32();
5717
5718 // Otherwise f32 mad is always full rate and returns the same result as
5719 // the separate operations so should be preferred over fma.
5720 // However, it does not support denormals.
5722 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5723
5724 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5725 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5726 }
5727 case MVT::f64:
5728 return true;
5729 case MVT::f16:
5730 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5731 default:
5732 break;
5733 }
5734
5735 return false;
5736}
5737
5739 LLT Ty) const {
5740 switch (Ty.getScalarSizeInBits()) {
5741 case 16:
5742 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5743 case 32:
5744 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5745 case 64:
5746 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5747 default:
5748 break;
5749 }
5750
5751 return false;
5752}
5753
5754// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5755// specific details.
5757 Type *Ty) const {
5758 switch (Ty->getScalarSizeInBits()) {
5759 case 16: {
5761 return Subtarget->has16BitInsts() &&
5762 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5763 }
5764 case 32: {
5765 if (!Subtarget->hasMadMacF32Insts())
5766 return Subtarget->hasFastFMAF32();
5767
5769 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5770 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5771
5772 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5773 }
5774 case 64:
5775 return true;
5776 default:
5777 break;
5778 }
5779
5780 return false;
5781}
5782
5784 if (!Ty.isScalar())
5785 return false;
5786
5787 if (Ty.getScalarSizeInBits() == 16)
5788 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5789 if (Ty.getScalarSizeInBits() == 32)
5790 return Subtarget->hasMadMacF32Insts() &&
5791 denormalModeIsFlushAllF32(*MI.getMF());
5792
5793 return false;
5794}
5795
5797 const SDNode *N) const {
5798 // TODO: Check future ftz flag
5799 // v_mad_f32/v_mac_f32 do not support denormals.
5800 EVT VT = N->getValueType(0);
5801 if (VT == MVT::f32)
5802 return Subtarget->hasMadMacF32Insts() &&
5804 if (VT == MVT::f16) {
5805 return Subtarget->hasMadF16() &&
5807 }
5808
5809 return false;
5810}
5811
5812//===----------------------------------------------------------------------===//
5813// Custom DAG Lowering Operations
5814//===----------------------------------------------------------------------===//
5815
5816// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5817// wider vector type is legal.
5819 SelectionDAG &DAG) const {
5820 unsigned Opc = Op.getOpcode();
5821 EVT VT = Op.getValueType();
5822 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5823 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5824 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5825 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5826
5827 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5828
5829 SDLoc SL(Op);
5830 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5831 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5832
5833 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5834}
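// For example (illustrative): a v4f16 unary op is lowered here as two v2f16
// ops on the split halves and recombined with CONCAT_VECTORS, rather than
// letting LegalizeDAG scalarize it into four f16 operations.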
5835
5836// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5837// wider vector type is legal.
5839 SelectionDAG &DAG) const {
5840 unsigned Opc = Op.getOpcode();
5841 EVT VT = Op.getValueType();
5842 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5843 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5844 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5845 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5846
5847 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5848 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5849
5850 SDLoc SL(Op);
5851
5852 SDValue OpLo =
5853 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5854 SDValue OpHi =
5855 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5856
5857 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5858}
5859
5861 SelectionDAG &DAG) const {
5862 unsigned Opc = Op.getOpcode();
5863 EVT VT = Op.getValueType();
5864 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5865 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5866 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5867 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5868 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5869 VT == MVT::v32bf16);
5870
5871 SDValue Op0 = Op.getOperand(0);
5872 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5873 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5874 : std::pair(Op0, Op0);
5875
5876 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5877 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5878
5879 SDLoc SL(Op);
5880 auto ResVT = DAG.GetSplitDestVTs(VT);
5881
5882 SDValue OpLo =
5883 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5884 SDValue OpHi =
5885 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5886
5887 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5888}
5889
5891 switch (Op.getOpcode()) {
5892 default:
5894 case ISD::BRCOND:
5895 return LowerBRCOND(Op, DAG);
5896 case ISD::RETURNADDR:
5897 return LowerRETURNADDR(Op, DAG);
5898 case ISD::LOAD: {
5899 SDValue Result = LowerLOAD(Op, DAG);
5900 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5901 "Load should return a value and a chain");
5902 return Result;
5903 }
5904 case ISD::FSQRT: {
5905 EVT VT = Op.getValueType();
5906 if (VT == MVT::f32)
5907 return lowerFSQRTF32(Op, DAG);
5908 if (VT == MVT::f64)
5909 return lowerFSQRTF64(Op, DAG);
5910 return SDValue();
5911 }
5912 case ISD::FSIN:
5913 case ISD::FCOS:
5914 return LowerTrig(Op, DAG);
5915 case ISD::SELECT:
5916 return LowerSELECT(Op, DAG);
5917 case ISD::FDIV:
5918 return LowerFDIV(Op, DAG);
5919 case ISD::FFREXP:
5920 return LowerFFREXP(Op, DAG);
5922 return LowerATOMIC_CMP_SWAP(Op, DAG);
5923 case ISD::STORE:
5924 return LowerSTORE(Op, DAG);
5925 case ISD::GlobalAddress: {
5928 return LowerGlobalAddress(MFI, Op, DAG);
5929 }
5931 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5933 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5935 return LowerINTRINSIC_VOID(Op, DAG);
5936 case ISD::ADDRSPACECAST:
5937 return lowerADDRSPACECAST(Op, DAG);
5939 return lowerINSERT_SUBVECTOR(Op, DAG);
5941 return lowerINSERT_VECTOR_ELT(Op, DAG);
5943 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5945 return lowerVECTOR_SHUFFLE(Op, DAG);
5947 return lowerSCALAR_TO_VECTOR(Op, DAG);
5948 case ISD::BUILD_VECTOR:
5949 return lowerBUILD_VECTOR(Op, DAG);
5950 case ISD::FP_ROUND:
5952 return lowerFP_ROUND(Op, DAG);
5953 case ISD::TRAP:
5954 return lowerTRAP(Op, DAG);
5955 case ISD::DEBUGTRAP:
5956 return lowerDEBUGTRAP(Op, DAG);
5957 case ISD::ABS:
5958 case ISD::FABS:
5959 case ISD::FNEG:
5960 case ISD::FCANONICALIZE:
5961 case ISD::BSWAP:
5962 return splitUnaryVectorOp(Op, DAG);
5963 case ISD::FMINNUM:
5964 case ISD::FMAXNUM:
5965 return lowerFMINNUM_FMAXNUM(Op, DAG);
5966 case ISD::FLDEXP:
5967 case ISD::STRICT_FLDEXP:
5968 return lowerFLDEXP(Op, DAG);
5969 case ISD::FMA:
5970 return splitTernaryVectorOp(Op, DAG);
5971 case ISD::FP_TO_SINT:
5972 case ISD::FP_TO_UINT:
5973 return LowerFP_TO_INT(Op, DAG);
5974 case ISD::SHL:
5975 case ISD::SRA:
5976 case ISD::SRL:
5977 case ISD::ADD:
5978 case ISD::SUB:
5979 case ISD::SMIN:
5980 case ISD::SMAX:
5981 case ISD::UMIN:
5982 case ISD::UMAX:
5983 case ISD::FADD:
5984 case ISD::FMUL:
5985 case ISD::FMINNUM_IEEE:
5986 case ISD::FMAXNUM_IEEE:
5987 case ISD::FMINIMUM:
5988 case ISD::FMAXIMUM:
5989 case ISD::FMINIMUMNUM:
5990 case ISD::FMAXIMUMNUM:
5991 case ISD::UADDSAT:
5992 case ISD::USUBSAT:
5993 case ISD::SADDSAT:
5994 case ISD::SSUBSAT:
5995 return splitBinaryVectorOp(Op, DAG);
5996 case ISD::MUL:
5997 return lowerMUL(Op, DAG);
5998 case ISD::SMULO:
5999 case ISD::UMULO:
6000 return lowerXMULO(Op, DAG);
6001 case ISD::SMUL_LOHI:
6002 case ISD::UMUL_LOHI:
6003 return lowerXMUL_LOHI(Op, DAG);
6005 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6006 case ISD::STACKSAVE:
6007 return LowerSTACKSAVE(Op, DAG);
6008 case ISD::GET_ROUNDING:
6009 return lowerGET_ROUNDING(Op, DAG);
6010 case ISD::SET_ROUNDING:
6011 return lowerSET_ROUNDING(Op, DAG);
6012 case ISD::PREFETCH:
6013 return lowerPREFETCH(Op, DAG);
6014 case ISD::FP_EXTEND:
6016 return lowerFP_EXTEND(Op, DAG);
6017 case ISD::GET_FPENV:
6018 return lowerGET_FPENV(Op, DAG);
6019 case ISD::SET_FPENV:
6020 return lowerSET_FPENV(Op, DAG);
6021 }
6022 return SDValue();
6023}
6024
6025// Used for D16: Casts the result of an instruction into the right vector,
6026// packs values if loads return unpacked values.
6028 const SDLoc &DL, SelectionDAG &DAG,
6029 bool Unpacked) {
6030 if (!LoadVT.isVector())
6031 return Result;
6032
6033 // Cast back to the original packed type or to a larger type that is a
6034 // multiple of 32 bits for D16. Widening the return type is required for
6035 // legalization.
6036 EVT FittingLoadVT = LoadVT;
6037 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6038 FittingLoadVT =
6040 LoadVT.getVectorNumElements() + 1);
6041 }
6042
6043 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6044 // Truncate to v2i16/v4i16.
6045 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6046
6047 // Workaround legalizer not scalarizing truncate after vector op
6048 // legalization but not creating intermediate vector trunc.
6050 DAG.ExtractVectorElements(Result, Elts);
6051 for (SDValue &Elt : Elts)
6052 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6053
6054 // Pad illegal v1i16/v3f16 to v4i16
6055 if ((LoadVT.getVectorNumElements() % 2) == 1)
6056 Elts.push_back(DAG.getUNDEF(MVT::i16));
6057
6058 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6059
6060 // Bitcast to original type (v2f16/v4f16).
6061 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6062 }
6063
6064 // Cast back to the original packed type.
6065 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6066}
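// For example (illustrative, unpacked D16 path): a v2f16 result that arrives
// as v2i32 is rebuilt by truncating each element to i16, forming a v2i16
// build_vector, and bitcasting back to v2f16; an odd-width result such as
// v3f16 is first padded with an undef element so it fits the legal v4 type.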
6067
6068SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6069 SelectionDAG &DAG,
6071 bool IsIntrinsic) const {
6072 SDLoc DL(M);
6073
6074 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6075 EVT LoadVT = M->getValueType(0);
6076
6077 EVT EquivLoadVT = LoadVT;
6078 if (LoadVT.isVector()) {
6079 if (Unpacked) {
6080 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6081 LoadVT.getVectorNumElements());
6082 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6083 // Widen v3f16 to legal type
6084 EquivLoadVT =
6086 LoadVT.getVectorNumElements() + 1);
6087 }
6088 }
6089
6090 // Change from v4f16/v2f16 to EquivLoadVT.
6091 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6092
6094 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6095 M->getMemoryVT(), M->getMemOperand());
6096
6097 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6098
6099 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6100}
6101
6102SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6103 SelectionDAG &DAG,
6104 ArrayRef<SDValue> Ops) const {
6105 SDLoc DL(M);
6106 EVT LoadVT = M->getValueType(0);
6107 EVT EltType = LoadVT.getScalarType();
6108 EVT IntVT = LoadVT.changeTypeToInteger();
6109
6110 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6111
6112 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6113 bool IsTFE = M->getNumValues() == 3;
6114
6115 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6119
6120 if (IsD16) {
6121 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6122 }
6123
6124 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6125 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6126 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6127 IsTFE);
6128
6129 if (isTypeLegal(LoadVT)) {
6130 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6131 M->getMemOperand(), DAG);
6132 }
6133
6134 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6135 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6136 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6137 M->getMemOperand(), DAG);
6138 return DAG.getMergeValues(
6139 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6140 DL);
6141}
6142
6144 SelectionDAG &DAG) {
6145 EVT VT = N->getValueType(0);
6146 unsigned CondCode = N->getConstantOperandVal(3);
6147 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6148 return DAG.getUNDEF(VT);
6149
6150 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6151
6152 SDValue LHS = N->getOperand(1);
6153 SDValue RHS = N->getOperand(2);
6154
6155 SDLoc DL(N);
6156
6157 EVT CmpVT = LHS.getValueType();
6158 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6159 unsigned PromoteOp =
6161 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6162 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6163 }
6164
6165 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6166
6167 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6168 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6169
6170 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6171 DAG.getCondCode(CCOpcode));
6172 if (VT.bitsEq(CCVT))
6173 return SetCC;
6174 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6175}
6176
6178 SelectionDAG &DAG) {
6179 EVT VT = N->getValueType(0);
6180
6181 unsigned CondCode = N->getConstantOperandVal(3);
6182 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6183 return DAG.getUNDEF(VT);
6184
6185 SDValue Src0 = N->getOperand(1);
6186 SDValue Src1 = N->getOperand(2);
6187 EVT CmpVT = Src0.getValueType();
6188 SDLoc SL(N);
6189
6190 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6191 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6192 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6193 }
6194
6195 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6196 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6197 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6198 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6199 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6200 DAG.getCondCode(CCOpcode));
6201 if (VT.bitsEq(CCVT))
6202 return SetCC;
6203 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6204}
6205
6207 SelectionDAG &DAG) {
6208 EVT VT = N->getValueType(0);
6209 SDValue Src = N->getOperand(1);
6210 SDLoc SL(N);
6211
6212 if (Src.getOpcode() == ISD::SETCC) {
6213 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6214 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6215 Src.getOperand(1), Src.getOperand(2));
6216 }
6217 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6218 // (ballot 0) -> 0
6219 if (Arg->isZero())
6220 return DAG.getConstant(0, SL, VT);
6221
6222 // (ballot 1) -> EXEC/EXEC_LO
6223 if (Arg->isOne()) {
6224 Register Exec;
6225 if (VT.getScalarSizeInBits() == 32)
6226 Exec = AMDGPU::EXEC_LO;
6227 else if (VT.getScalarSizeInBits() == 64)
6228 Exec = AMDGPU::EXEC;
6229 else
6230 return SDValue();
6231
6232 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6233 }
6234 }
6235
6236 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6237 // ISD::SETNE)
6238 return DAG.getNode(
6239 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6240 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6241}
6242
6244 SelectionDAG &DAG) {
6245 EVT VT = N->getValueType(0);
6246 unsigned ValSize = VT.getSizeInBits();
6247 unsigned IID = N->getConstantOperandVal(0);
6248 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6249 IID == Intrinsic::amdgcn_permlanex16;
6250 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6251 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6252 SDLoc SL(N);
6253 MVT IntVT = MVT::getIntegerVT(ValSize);
6254 const GCNSubtarget *ST = TLI.getSubtarget();
6255 unsigned SplitSize = 32;
6256 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6257 ST->hasDPALU_DPP() &&
6258 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6259 SplitSize = 64;
6260
6261 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6262 SDValue Src2, MVT ValT) -> SDValue {
6264 switch (IID) {
6265 case Intrinsic::amdgcn_permlane16:
6266 case Intrinsic::amdgcn_permlanex16:
6267 case Intrinsic::amdgcn_update_dpp:
6268 Operands.push_back(N->getOperand(6));
6269 Operands.push_back(N->getOperand(5));
6270 Operands.push_back(N->getOperand(4));
6271 [[fallthrough]];
6272 case Intrinsic::amdgcn_writelane:
6273 Operands.push_back(Src2);
6274 [[fallthrough]];
6275 case Intrinsic::amdgcn_readlane:
6276 case Intrinsic::amdgcn_set_inactive:
6277 case Intrinsic::amdgcn_set_inactive_chain_arg:
6278 case Intrinsic::amdgcn_mov_dpp8:
6279 Operands.push_back(Src1);
6280 [[fallthrough]];
6281 case Intrinsic::amdgcn_readfirstlane:
6282 case Intrinsic::amdgcn_permlane64:
6283 Operands.push_back(Src0);
6284 break;
6285 default:
6286 llvm_unreachable("unhandled lane op");
6287 }
6288
6289 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6290 std::reverse(Operands.begin(), Operands.end());
6291
6292 if (SDNode *GL = N->getGluedNode()) {
6293 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6294 GL = GL->getOperand(0).getNode();
6295 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6296 SDValue(GL, 0)));
6297 }
6298
6299 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6300 };
6301
6302 SDValue Src0 = N->getOperand(1);
6303 SDValue Src1, Src2;
6304 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6305 IID == Intrinsic::amdgcn_mov_dpp8 ||
6306 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6307 Src1 = N->getOperand(2);
6308 if (IID == Intrinsic::amdgcn_writelane ||
6309 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6310 Src2 = N->getOperand(3);
6311 }
6312
6313 if (ValSize == SplitSize) {
6314 // Already legal
6315 return SDValue();
6316 }
6317
6318 if (ValSize < 32) {
6319 bool IsFloat = VT.isFloatingPoint();
6320 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6321 SL, MVT::i32);
6322
6323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6324 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6325 SL, MVT::i32);
6326 }
6327
6328 if (IID == Intrinsic::amdgcn_writelane) {
6329 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6330 SL, MVT::i32);
6331 }
6332
6333 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6334 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6335 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6336 }
6337
6338 if (ValSize % SplitSize != 0)
6339 return SDValue();
6340
6341 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6342 EVT VT = N->getValueType(0);
6343 unsigned NE = VT.getVectorNumElements();
6344 EVT EltVT = VT.getVectorElementType();
6346 unsigned NumOperands = N->getNumOperands();
6347 SmallVector<SDValue, 4> Operands(NumOperands);
6348 SDNode *GL = N->getGluedNode();
6349
6350 // only handle convergencectrl_glue
6352
6353 for (unsigned i = 0; i != NE; ++i) {
6354 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6355 ++j) {
6356 SDValue Operand = N->getOperand(j);
6357 EVT OperandVT = Operand.getValueType();
6358 if (OperandVT.isVector()) {
6359 // A vector operand; extract a single element.
6360 EVT OperandEltVT = OperandVT.getVectorElementType();
6361 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6362 Operand, DAG.getVectorIdxConstant(i, SL));
6363 } else {
6364 // A scalar operand; just use it as is.
6365 Operands[j] = Operand;
6366 }
6367 }
6368
6369 if (GL)
6370 Operands[NumOperands - 1] =
6371 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6372 SDValue(GL->getOperand(0).getNode(), 0));
6373
6374 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6375 }
6376
6377 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6378 return DAG.getBuildVector(VecVT, SL, Scalars);
6379 };
6380
6381 if (VT.isVector()) {
6382 switch (MVT::SimpleValueType EltTy =
6384 case MVT::i32:
6385 case MVT::f32:
6386 if (SplitSize == 32) {
6387 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6388 return unrollLaneOp(LaneOp.getNode());
6389 }
6390 [[fallthrough]];
6391 case MVT::i16:
6392 case MVT::f16:
6393 case MVT::bf16: {
6394 unsigned SubVecNumElt =
6395 SplitSize / VT.getVectorElementType().getSizeInBits();
6396 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6398 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6399 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6400 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6401 DAG.getConstant(EltIdx, SL, MVT::i32));
6402
6403 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6404 IsPermLane16)
6405 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6406 DAG.getConstant(EltIdx, SL, MVT::i32));
6407
6408 if (IID == Intrinsic::amdgcn_writelane)
6409 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6410 DAG.getConstant(EltIdx, SL, MVT::i32));
6411
6412 Pieces.push_back(
6413 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6414 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6415 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6416 EltIdx += SubVecNumElt;
6417 }
6418 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6419 }
6420 default:
6421 // Handle all other cases by bitcasting to i32 vectors
6422 break;
6423 }
6424 }
6425
6426 MVT VecVT =
6427 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6428 Src0 = DAG.getBitcast(VecVT, Src0);
6429
6430 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6431 Src1 = DAG.getBitcast(VecVT, Src1);
6432
6433 if (IID == Intrinsic::amdgcn_writelane)
6434 Src2 = DAG.getBitcast(VecVT, Src2);
6435
6436 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6437 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6438 return DAG.getBitcast(VT, UnrolledLaneOp);
6439}
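// Overall strategy above (illustrative summary): lane-op sources narrower
// than 32 bits are any-extended to i32, processed, and truncated back; wider
// sources are bitcast to vectors of 32-bit pieces (64-bit pieces for
// update_dpp on DPALU-capable targets), the lane op is unrolled per piece,
// and the pieces are bitcast back to the original type.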
6440
6443 SelectionDAG &DAG) const {
6444 switch (N->getOpcode()) {
6446 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6447 Results.push_back(Res);
6448 return;
6449 }
6451 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6452 Results.push_back(Res);
6453 return;
6454 }
6456 unsigned IID = N->getConstantOperandVal(0);
6457 switch (IID) {
6458 case Intrinsic::amdgcn_make_buffer_rsrc:
6459 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6460 return;
6461 case Intrinsic::amdgcn_cvt_pkrtz: {
6462 SDValue Src0 = N->getOperand(1);
6463 SDValue Src1 = N->getOperand(2);
6464 SDLoc SL(N);
6465 SDValue Cvt =
6466 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6467 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6468 return;
6469 }
6470 case Intrinsic::amdgcn_cvt_pknorm_i16:
6471 case Intrinsic::amdgcn_cvt_pknorm_u16:
6472 case Intrinsic::amdgcn_cvt_pk_i16:
6473 case Intrinsic::amdgcn_cvt_pk_u16: {
6474 SDValue Src0 = N->getOperand(1);
6475 SDValue Src1 = N->getOperand(2);
6476 SDLoc SL(N);
6477 unsigned Opcode;
6478
6479 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6481 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6483 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6485 else
6487
6488 EVT VT = N->getValueType(0);
6489 if (isTypeLegal(VT))
6490 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6491 else {
6492 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6493 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6494 }
6495 return;
6496 }
6497 case Intrinsic::amdgcn_s_buffer_load: {
6498 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6499 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6500 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6501 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6502 // s_buffer_load_i8.
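// Illustrative flow (simplified): an i8 llvm.amdgcn.s.buffer.load with a
// uniform offset is emitted as a 32-bit unsigned scalar buffer load followed
// by a truncate to i8; if the loaded value is later sign-extended, the DAG
// combiner mentioned above rewrites the unsigned load into s_buffer_load_i8.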
6503 if (!Subtarget->hasScalarSubwordLoads())
6504 return;
6505 SDValue Op = SDValue(N, 0);
6506 SDValue Rsrc = Op.getOperand(1);
6507 SDValue Offset = Op.getOperand(2);
6508 SDValue CachePolicy = Op.getOperand(3);
6509 EVT VT = Op.getValueType();
6510 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6511 SDLoc DL(Op);
6513 const DataLayout &DataLayout = DAG.getDataLayout();
6514 Align Alignment =
6520 VT.getStoreSize(), Alignment);
6521 SDValue LoadVal;
6522 if (!Offset->isDivergent()) {
6523 SDValue Ops[] = {Rsrc, // source register
6524 Offset, CachePolicy};
6525 SDValue BufferLoad =
6527 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6528 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6529 } else {
6530 SDValue Ops[] = {
6531 DAG.getEntryNode(), // Chain
6532 Rsrc, // rsrc
6533 DAG.getConstant(0, DL, MVT::i32), // vindex
6534 {}, // voffset
6535 {}, // soffset
6536 {}, // offset
6537 CachePolicy, // cachepolicy
6538 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6539 };
6540 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6541 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6542 }
6543 Results.push_back(LoadVal);
6544 return;
6545 }
6546 }
6547 break;
6548 }
6550 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6551 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6552 // FIXME: Hacky
6553 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6554 Results.push_back(Res.getOperand(I));
6555 }
6556 } else {
6557 Results.push_back(Res);
6558 Results.push_back(Res.getValue(1));
6559 }
6560 return;
6561 }
6562
6563 break;
6564 }
6565 case ISD::SELECT: {
6566 SDLoc SL(N);
6567 EVT VT = N->getValueType(0);
6568 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6569 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6570 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6571
6572 EVT SelectVT = NewVT;
6573 if (NewVT.bitsLT(MVT::i32)) {
6574 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6575 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6576 SelectVT = MVT::i32;
6577 }
6578
6579 SDValue NewSelect =
6580 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6581
6582 if (NewVT != SelectVT)
6583 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6584 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6585 return;
6586 }
6587 case ISD::FNEG: {
6588 if (N->getValueType(0) != MVT::v2f16)
6589 break;
6590
6591 SDLoc SL(N);
6592 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6593
6594 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6595 DAG.getConstant(0x80008000, SL, MVT::i32));
6596 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6597 return;
6598 }
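// Both f16 halves are negated with a single 32-bit XOR: 0x80008000 flips the
// sign bit (0x8000) of each half. The FABS case below does the analogous AND
// with 0x7fff7fff to clear both sign bits.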
6599 case ISD::FABS: {
6600 if (N->getValueType(0) != MVT::v2f16)
6601 break;
6602
6603 SDLoc SL(N);
6604 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6605
6606 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6607 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6608 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6609 return;
6610 }
6611 case ISD::FSQRT: {
6612 if (N->getValueType(0) != MVT::f16)
6613 break;
6614 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6615 break;
6616 }
6617 default:
6619 break;
6620 }
6621}
6622
6623/// Helper function for LowerBRCOND
6624static SDNode *findUser(SDValue Value, unsigned Opcode) {
6625
6626 for (SDUse &U : Value->uses()) {
6627 if (U.get() != Value)
6628 continue;
6629
6630 if (U.getUser()->getOpcode() == Opcode)
6631 return U.getUser();
6632 }
6633 return nullptr;
6634}
6635
6636unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6637 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6638 switch (Intr->getConstantOperandVal(1)) {
6639 case Intrinsic::amdgcn_if:
6640 return AMDGPUISD::IF;
6641 case Intrinsic::amdgcn_else:
6642 return AMDGPUISD::ELSE;
6643 case Intrinsic::amdgcn_loop:
6644 return AMDGPUISD::LOOP;
6645 case Intrinsic::amdgcn_end_cf:
6646 llvm_unreachable("should not occur");
6647 default:
6648 return 0;
6649 }
6650 }
6651
6652 // break, if_break, else_break are all only used as inputs to loop, not
6653 // directly as branch conditions.
6654 return 0;
6655}
6656
6658 const Triple &TT = getTargetMachine().getTargetTriple();
6662}
6663
6665 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6666 return false;
6667
6668 // FIXME: Either avoid relying on address space here or change the default
6669 // address space for functions to avoid the explicit check.
6670 return (GV->getValueType()->isFunctionTy() ||
6673}
6674
6676 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6677}
6678
6680 if (!GV->hasExternalLinkage())
6681 return true;
6682
6683 const auto OS = getTargetMachine().getTargetTriple().getOS();
6684 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6685}
6686
6687/// This transforms the control flow intrinsics to get the branch destination as
6688 /// the last parameter; it also switches the branch target with BR if the need arises.
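// Illustrative sketch of the rewrite (simplified): for
//   brcond (setcc (intrinsic @llvm.amdgcn.if ...), 1, ne), <target>
// the intrinsic is rebuilt as the corresponding AMDGPUISD::IF/ELSE/LOOP node
// with the branch destination appended as its last operand, and any
// unconditional BR user is re-pointed at the original brcond target.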
6689SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6690 SDLoc DL(BRCOND);
6691
6692 SDNode *Intr = BRCOND.getOperand(1).getNode();
6693 SDValue Target = BRCOND.getOperand(2);
6694 SDNode *BR = nullptr;
6695 SDNode *SetCC = nullptr;
6696
6697 if (Intr->getOpcode() == ISD::SETCC) {
6698 // As long as we negate the condition everything is fine
6699 SetCC = Intr;
6700 Intr = SetCC->getOperand(0).getNode();
6701
6702 } else {
6703 // Get the target from BR if we don't negate the condition
6704 BR = findUser(BRCOND, ISD::BR);
6705 assert(BR && "brcond missing unconditional branch user");
6706 Target = BR->getOperand(1);
6707 }
6708
6709 unsigned CFNode = isCFIntrinsic(Intr);
6710 if (CFNode == 0) {
6711 // This is a uniform branch so we don't need to legalize.
6712 return BRCOND;
6713 }
6714
6715 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6716 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6717
6718 assert(!SetCC ||
6719 (SetCC->getConstantOperandVal(1) == 1 &&
6720 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6721 ISD::SETNE));
6722
6723 // operands of the new intrinsic call
6725 if (HaveChain)
6726 Ops.push_back(BRCOND.getOperand(0));
6727
6728 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6729 Ops.push_back(Target);
6730
6731 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6732
6733 // build the new intrinsic call
6734 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6735
6736 if (!HaveChain) {
6737 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6738
6739 Result = DAG.getMergeValues(Ops, DL).getNode();
6740 }
6741
6742 if (BR) {
6743 // Give the branch instruction our target
6744 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6745 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6746 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6747 }
6748
6749 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6750
6751 // Copy the intrinsic results to registers
6752 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6754 if (!CopyToReg)
6755 continue;
6756
6757 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6758 SDValue(Result, i - 1), SDValue());
6759
6760 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6761 }
6762
6763 // Remove the old intrinsic from the chain
6764 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6765 Intr->getOperand(0));
6766
6767 return Chain;
6768}
6769
6770SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6771 MVT VT = Op.getSimpleValueType();
6772 SDLoc DL(Op);
6773 // Checking the depth
6774 if (Op.getConstantOperandVal(0) != 0)
6775 return DAG.getConstant(0, DL, VT);
6776
6779 // Check for kernel and shader functions
6780 if (Info->isEntryFunction())
6781 return DAG.getConstant(0, DL, VT);
6782
6783 MachineFrameInfo &MFI = MF.getFrameInfo();
6784 // There is a call to @llvm.returnaddress in this function
6785 MFI.setReturnAddressIsTaken(true);
6786
6788 // Get the return address reg and mark it as an implicit live-in
6789 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6790 getRegClassFor(VT, Op.getNode()->isDivergent()));
6791
6792 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6793}
6794
6795SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6796 const SDLoc &DL, EVT VT) const {
6797 return Op.getValueType().bitsLE(VT)
6798 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6799 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6800 DAG.getTargetConstant(0, DL, MVT::i32));
6801}
6802
6803SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6804 assert(Op.getValueType() == MVT::f16 &&
6805 "Do not know how to custom lower FP_ROUND for non-f16 type");
6806
6807 SDValue Src = Op.getOperand(0);
6808 EVT SrcVT = Src.getValueType();
6809 if (SrcVT != MVT::f64)
6810 return Op;
6811
6812 // TODO: Handle strictfp
6813 if (Op.getOpcode() != ISD::FP_ROUND)
6814 return Op;
6815
6816 SDLoc DL(Op);
6817
6818 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6819 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6820 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6821}
6822
6823SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6824 SelectionDAG &DAG) const {
6825 EVT VT = Op.getValueType();
6826 const MachineFunction &MF = DAG.getMachineFunction();
6828 bool IsIEEEMode = Info->getMode().IEEE;
6829
6830 // FIXME: Assert during selection that this is only selected for
6831 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6832 // mode functions, but this happens to be OK since it's only done in cases
6833 // where there is known no sNaN.
6834 if (IsIEEEMode)
6835 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6836
6837 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6838 VT == MVT::v16bf16)
6839 return splitBinaryVectorOp(Op, DAG);
6840 return Op;
6841}
6842
6843SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6844 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6845 EVT VT = Op.getValueType();
6846 assert(VT == MVT::f16);
6847
6848 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6849 EVT ExpVT = Exp.getValueType();
6850 if (ExpVT == MVT::i16)
6851 return Op;
6852
6853 SDLoc DL(Op);
6854
6855 // Correct the exponent type for f16 to i16.
6856 // Clamp the range of the exponent to the instruction's range.
6857
6858 // TODO: This should be a generic narrowing legalization, and can easily be
6859 // for GlobalISel.
6860
6861 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6862 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6863
6864 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6865 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6866
6867 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6868
6869 if (IsStrict) {
6870 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6871 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6872 }
6873
6874 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6875}
6876
6878 switch (Op->getOpcode()) {
6879 case ISD::SRA:
6880 case ISD::SMIN:
6881 case ISD::SMAX:
6882 return ISD::SIGN_EXTEND;
6883 case ISD::SRL:
6884 case ISD::UMIN:
6885 case ISD::UMAX:
6886 return ISD::ZERO_EXTEND;
6887 case ISD::ADD:
6888 case ISD::SUB:
6889 case ISD::AND:
6890 case ISD::OR:
6891 case ISD::XOR:
6892 case ISD::SHL:
6893 case ISD::SELECT:
6894 case ISD::MUL:
6895 // operation result won't be influenced by garbage high bits.
6896 // TODO: are all of those cases correct, and are there more?
6897 return ISD::ANY_EXTEND;
6898 case ISD::SETCC: {
6899 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6901 }
6902 default:
6903 llvm_unreachable("unexpected opcode!");
6904 }
6905}
6906
6907SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6908 DAGCombinerInfo &DCI) const {
6909 const unsigned Opc = Op.getOpcode();
6910 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6911 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6912 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6913 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6914 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6915
6916 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6917 : Op->getOperand(0).getValueType();
6918 auto ExtTy = OpTy.changeElementType(MVT::i32);
6919
6920 if (DCI.isBeforeLegalizeOps() ||
6921 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6922 return SDValue();
6923
6924 auto &DAG = DCI.DAG;
6925
6926 SDLoc DL(Op);
6927 SDValue LHS;
6928 SDValue RHS;
6929 if (Opc == ISD::SELECT) {
6930 LHS = Op->getOperand(1);
6931 RHS = Op->getOperand(2);
6932 } else {
6933 LHS = Op->getOperand(0);
6934 RHS = Op->getOperand(1);
6935 }
6936
6937 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6938 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6939
6940 // Special case: for shifts, the RHS always needs a zext.
6941 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6942 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6943 else
6944 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6945
6946 // setcc always returns i1/i1 vec so no need to truncate after.
6947 if (Opc == ISD::SETCC) {
6948 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6949 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6950 }
6951
6952 // For other ops, we extend the operation's return type as well so we need to
6953 // truncate back to the original type.
6954 SDValue NewVal;
6955 if (Opc == ISD::SELECT)
6956 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6957 else
6958 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6959
6960 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6961}
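// For example (illustrative): a uniform (smin i16 %a, %b) is promoted to
//   trunc (smin (sign_extend %a to i32), (sign_extend %b to i32))
// while shift amounts always get a zero_extend, and setcc keeps its i1
// result so it needs no trailing truncate.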
6962
6963// Custom lowering for vector multiplications and s_mul_u64.
6964SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6965 EVT VT = Op.getValueType();
6966
6967 // Split vector operands.
6968 if (VT.isVector())
6969 return splitBinaryVectorOp(Op, DAG);
6970
6971 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6972
6973 // There are four ways to lower s_mul_u64:
6974 //
6975 // 1. If all the operands are uniform, then we lower it as it is.
6976 //
6977 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6978 // multiplications because there is not a vector equivalent of s_mul_u64.
6979 //
6980 // 3. If the cost model decides that it is more efficient to use vector
6981 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6982 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6983 //
6984 // 4. If the cost model decides to use vector registers and both of the
6985 // operands are zero-extended/sign-extended from 32-bits, then we split the
6986 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6987 // possible to check if the operands are zero-extended or sign-extended in
6988 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6989 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6990 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6991 // If the cost model decides that we have to use vector registers, then
6992 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6993 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6994 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6995 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6996 // SIInstrInfo.cpp .
6997
6998 if (Op->isDivergent())
6999 return SDValue();
7000
7001 SDValue Op0 = Op.getOperand(0);
7002 SDValue Op1 = Op.getOperand(1);
7003 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7004 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7005 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7006 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7007 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7008 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7009 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7010 SDLoc SL(Op);
7011 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7012 return SDValue(
7013 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7014 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7015 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7016 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7017 return SDValue(
7018 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7019 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7020 return Op;
7021}
7022
7023SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7024 EVT VT = Op.getValueType();
7025 SDLoc SL(Op);
7026 SDValue LHS = Op.getOperand(0);
7027 SDValue RHS = Op.getOperand(1);
7028 bool isSigned = Op.getOpcode() == ISD::SMULO;
7029
7030 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7031 const APInt &C = RHSC->getAPIntValue();
7032 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7033 if (C.isPowerOf2()) {
7034 // smulo(x, signed_min) is the same as umulo(x, signed_min).
7035 bool UseArithShift = isSigned && !C.isMinSignedValue();
7036 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7037 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7038 SDValue Overflow =
7039 DAG.getSetCC(SL, MVT::i1,
7040 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7041 Result, ShiftAmt),
7042 LHS, ISD::SETNE);
7043 return DAG.getMergeValues({Result, Overflow}, SL);
7044 }
7045 }
7046
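// Conceptually (an illustrative note, not in the original source), the general
// case below detects overflow by checking the high half of the full product:
// for an unsigned multiply it must be 0, and for a signed multiply it must
// equal the sign-extension of the low half (Result >> (bits - 1)). E.g. for
// unsigned i32, 0x10000 * 0x10000 yields mulhu == 1 != 0, so overflow is set.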
7047 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7048 SDValue Top =
7049 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7050
7051 SDValue Sign = isSigned
7052 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7053 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7054 SL, MVT::i32))
7055 : DAG.getConstant(0, SL, VT);
7056 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7057
7058 return DAG.getMergeValues({Result, Overflow}, SL);
7059}
7060
7061SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7062 if (Op->isDivergent()) {
7063 // Select to V_MAD_[IU]64_[IU]32.
7064 return Op;
7065 }
7066 if (Subtarget->hasSMulHi()) {
7067 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7068 return SDValue();
7069 }
7070 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7071 // calculate the high part, so we might as well do the whole thing with
7072 // V_MAD_[IU]64_[IU]32.
7073 return Op;
7074}
7075
7076SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7077 if (!Subtarget->isTrapHandlerEnabled() ||
7078 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7079 return lowerTrapEndpgm(Op, DAG);
7080
7081 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7082 : lowerTrapHsaQueuePtr(Op, DAG);
7083}
7084
7085SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7086 SDLoc SL(Op);
7087 SDValue Chain = Op.getOperand(0);
7088 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7089}
7090
7091SDValue
7092SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7093 const SDLoc &DL, Align Alignment,
7094 ImplicitParameter Param) const {
7097 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7099 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7102}
7103
7104SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7105 SelectionDAG &DAG) const {
7106 SDLoc SL(Op);
7107 SDValue Chain = Op.getOperand(0);
7108
7109 SDValue QueuePtr;
7110 // For code object version 5, QueuePtr is passed through implicit kernarg.
7111 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7113 QueuePtr =
7114 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7115 } else {
7118 Register UserSGPR = Info->getQueuePtrUserSGPR();
7119
7120 if (UserSGPR == AMDGPU::NoRegister) {
7121 // We probably are in a function incorrectly marked with
7122 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7123 // trap, so just use a null pointer.
7124 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7125 } else {
7126 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7127 MVT::i64);
7128 }
7129 }
7130
7131 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7132 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7133
7135 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7136 ToReg.getValue(1)};
7137 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7138}
7139
7140SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7141 SDLoc SL(Op);
7142 SDValue Chain = Op.getOperand(0);
7143
7144 // We need to simulate the 's_trap 2' instruction on targets that run in
7145 // PRIV=1 (where it is treated as a nop).
7146 if (Subtarget->hasPrivEnabledTrap2NopBug())
7147 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7148
7150 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7151 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7152}
7153
7154SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7155 SDLoc SL(Op);
7156 SDValue Chain = Op.getOperand(0);
7158
7159 if (!Subtarget->isTrapHandlerEnabled() ||
7162 "debugtrap handler not supported",
7163 Op.getDebugLoc(), DS_Warning);
7164 LLVMContext &Ctx = MF.getFunction().getContext();
7165 Ctx.diagnose(NoTrap);
7166 return Chain;
7167 }
7168
7169 uint64_t TrapID =
7171 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7172 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7173}
7174
7175SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7176 SelectionDAG &DAG) const {
7177 if (Subtarget->hasApertureRegs()) {
7178 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7179 ? AMDGPU::SRC_SHARED_BASE
7180 : AMDGPU::SRC_PRIVATE_BASE;
7181 // Note: this feature (register) is broken. When used as a 32-bit operand,
7182 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7183 // bits.
7184 //
7185 // To work around the issue, directly emit a 64 bit mov from this register
7186 // then extract the high bits. Note that this shouldn't even result in a
7187 // shift being emitted and simply become a pair of registers (e.g.):
7188 // s_mov_b64 s[6:7], src_shared_base
7189 // v_mov_b32_e32 v1, s7
7190 //
7191 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7192 // coalescing would kick in and it would think it's okay to use the "HI"
7193 // subregister directly (instead of extracting the HI 32 bits) which is an
7194 // artificial (unusable) register.
7195 // Register TableGen definitions would need an overhaul to get rid of the
7196 // artificial "HI" aperture registers and prevent this kind of issue from
7197 // happening.
7198 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7199 DAG.getRegister(ApertureRegNo, MVT::i64));
7200 return DAG.getNode(
7201 ISD::TRUNCATE, DL, MVT::i32,
7202 DAG.getNode(ISD::SRL, DL, MVT::i64,
7203 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7204 }
7205
7206 // For code object version 5, private_base and shared_base are passed through
7207 // implicit kernargs.
7208 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7212 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7213 }
7214
7217 Register UserSGPR = Info->getQueuePtrUserSGPR();
7218 if (UserSGPR == AMDGPU::NoRegister) {
7219 // We probably are in a function incorrectly marked with
7220 // amdgpu-no-queue-ptr. This is undefined.
7221 return DAG.getUNDEF(MVT::i32);
7222 }
7223
7224 SDValue QueuePtr =
7225 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7226
7227 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7228 // private_segment_aperture_base_hi.
7229 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7230
7231 SDValue Ptr =
7232 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7233
7234 // TODO: Use custom target PseudoSourceValue.
7235 // TODO: We should use the value from the IR intrinsic call, but it might not
7236 // be available and how do we get it?
7238 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7239 commonAlignment(Align(64), StructOffset),
7242}
7243
7244/// Return true if the value is a known valid address, such that a null check is
7245/// not necessary.
7247 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7248 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7249 isa<BasicBlockSDNode>(Val))
7250 return true;
7251
7252 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7253 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7254
7255 // TODO: Search through arithmetic, handle arguments and loads
7256 // marked nonnull.
7257 return false;
7258}
7259
7260SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7261 SelectionDAG &DAG) const {
7262 SDLoc SL(Op);
7263
7264 const AMDGPUTargetMachine &TM =
7265 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7266
7267 unsigned DestAS, SrcAS;
7268 SDValue Src;
7269 bool IsNonNull = false;
7270 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7271 SrcAS = ASC->getSrcAddressSpace();
7272 Src = ASC->getOperand(0);
7273 DestAS = ASC->getDestAddressSpace();
7274 } else {
7275 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7276 Op.getConstantOperandVal(0) ==
7277 Intrinsic::amdgcn_addrspacecast_nonnull);
7278 Src = Op->getOperand(1);
7279 SrcAS = Op->getConstantOperandVal(2);
7280 DestAS = Op->getConstantOperandVal(3);
7281 IsNonNull = true;
7282 }
7283
7284 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7285
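// Summarizing the two directions below (illustrative, not in the original
// source):
//   flat -> local/private : p32 = (flat == 0) ? segment_null : trunc(flat)
//   local/private -> flat : p64 = (p32 == segment_null)
//                                     ? 0
//                                     : ((u64)aperture_hi << 32) | zext(p32)
// and both null checks are skipped entirely when the source is known non-null.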
7286 // flat -> local/private
7287 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7288 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7289 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7290 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7291
7292 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7293 return Ptr;
7294
7295 unsigned NullVal = TM.getNullPointerValue(DestAS);
7296 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7297 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7298
7299 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7300 SegmentNullPtr);
7301 }
7302 }
7303
7304 // local/private -> flat
7305 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7306 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7307 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7308
7309 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7310 SDValue CvtPtr =
7311 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7312 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7313
7314 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7315 return CvtPtr;
7316
7317 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7318 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7319
7320 SDValue NonNull =
7321 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7322
7323 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7324 FlatNullPtr);
7325 }
7326 }
7327
7328 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7329 Op.getValueType() == MVT::i64) {
7332 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7333 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7334 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7335 }
7336
7337 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7338 Src.getValueType() == MVT::i64)
7339 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7340
7341 // global <-> flat are no-ops and never emitted.
7342
7343 const MachineFunction &MF = DAG.getMachineFunction();
7344 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7345 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7346 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7347
7348 return DAG.getUNDEF(Op->getValueType(0));
7349}
7350
7351// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7352// the small vector and inserting them into the big vector. That is better than
7353// the default expansion of doing it via a stack slot. Even though the use of
7354// the stack slot would be optimized away afterwards, the stack slot itself
7355// remains.
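// For example (illustrative, not from the original source): inserting a v2i16
// subvector at index 2 of a v8i16 takes the 16-bit path below and becomes a
// single i32 element insert at index 1 of the bitcast v4i32, while inserting a
// v2i32 at index 1 of a v4i32 falls through to the generic per-element
// extract/insert loop at the end.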
7356SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7357 SelectionDAG &DAG) const {
7358 SDValue Vec = Op.getOperand(0);
7359 SDValue Ins = Op.getOperand(1);
7360 SDValue Idx = Op.getOperand(2);
7361 EVT VecVT = Vec.getValueType();
7362 EVT InsVT = Ins.getValueType();
7363 EVT EltVT = VecVT.getVectorElementType();
7364 unsigned InsNumElts = InsVT.getVectorNumElements();
7365 unsigned IdxVal = Idx->getAsZExtVal();
7366 SDLoc SL(Op);
7367
7368 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7369 // Insert 32-bit registers at a time.
7370 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7371
7372 unsigned VecNumElts = VecVT.getVectorNumElements();
7373 EVT NewVecVT =
7374 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7375 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7377 MVT::i32, InsNumElts / 2);
7378
7379 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7380 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7381
7382 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7383 SDValue Elt;
7384 if (InsNumElts == 2) {
7385 Elt = Ins;
7386 } else {
7387 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7388 DAG.getConstant(I, SL, MVT::i32));
7389 }
7390 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7391 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7392 }
7393
7394 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7395 }
7396
7397 for (unsigned I = 0; I != InsNumElts; ++I) {
7398 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7399 DAG.getConstant(I, SL, MVT::i32));
7400 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7401 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7402 }
7403 return Vec;
7404}
7405
7406SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7407 SelectionDAG &DAG) const {
7408 SDValue Vec = Op.getOperand(0);
7409 SDValue InsVal = Op.getOperand(1);
7410 SDValue Idx = Op.getOperand(2);
7411 EVT VecVT = Vec.getValueType();
7412 EVT EltVT = VecVT.getVectorElementType();
7413 unsigned VecSize = VecVT.getSizeInBits();
7414 unsigned EltSize = EltVT.getSizeInBits();
7415 SDLoc SL(Op);
7416
7417 // Specially handle the case of v4i16 with static indexing.
7418 unsigned NumElts = VecVT.getVectorNumElements();
7419 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7420 if (NumElts == 4 && EltSize == 16 && KIdx) {
7421 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7422
7423 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7424 DAG.getConstant(0, SL, MVT::i32));
7425 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7426 DAG.getConstant(1, SL, MVT::i32));
7427
7428 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7429 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7430
7431 unsigned Idx = KIdx->getZExtValue();
7432 bool InsertLo = Idx < 2;
7433 SDValue InsHalf = DAG.getNode(
7434 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7435 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7436 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7437
7438 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7439
7440 SDValue Concat =
7441 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7442 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7443
7444 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7445 }
7446
7447 // Static indexing does not lower to stack access, and hence there is no need
7448 // for special custom lowering to avoid stack access.
7449 if (isa<ConstantSDNode>(Idx))
7450 return SDValue();
7451
7452 // Avoid stack access for dynamic indexing by custom lowering to
7453 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
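// Worked example (illustrative, not in the original source): for a v4i16
// vector V, insert value val at dynamic index idx. The code below computes
//   bitidx = idx << 4                 // log2(16-bit element size)
//   mask   = 0xffff << bitidx         // the v_bfm-style mask
//   result = (splat(val) & mask) | (V & ~mask)
// i.e. lane idx comes from the splat of val and every other lane keeps its
// original contents.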
7454
7455 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7456
7457 MVT IntVT = MVT::getIntegerVT(VecSize);
7458
7459 // Convert vector index to bit-index and get the required bit mask.
7460 assert(isPowerOf2_32(EltSize));
7461 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7462 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7463 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7464 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7465 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7466
7467 // 1. Create a congruent vector with the target value in each element.
7468 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7469 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7470
7471 // 2. Mask off all other indices except the required index within (1).
7472 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7473
7474 // 3. Mask off the required index within the target vector.
7475 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7476 SDValue RHS =
7477 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7478
7479 // 4. Get (2) and (3) ORed into the target vector.
7480 SDValue BFI =
7481 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7482
7483 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7484}
7485
7486SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7487 SelectionDAG &DAG) const {
7488 SDLoc SL(Op);
7489
7490 EVT ResultVT = Op.getValueType();
7491 SDValue Vec = Op.getOperand(0);
7492 SDValue Idx = Op.getOperand(1);
7493 EVT VecVT = Vec.getValueType();
7494 unsigned VecSize = VecVT.getSizeInBits();
7495 EVT EltVT = VecVT.getVectorElementType();
7496
7497 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7498
7499 // Make sure we do any optimizations that will make it easier to fold
7500 // source modifiers before obscuring it with bit operations.
7501
7502 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7503 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7504 return Combined;
7505
7506 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7507 SDValue Lo, Hi;
7508 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7509
7510 if (VecSize == 128) {
7511 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7512 Lo = DAG.getBitcast(LoVT,
7513 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7514 DAG.getConstant(0, SL, MVT::i32)));
7515 Hi = DAG.getBitcast(HiVT,
7516 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7517 DAG.getConstant(1, SL, MVT::i32)));
7518 } else if (VecSize == 256) {
7519 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7520 SDValue Parts[4];
7521 for (unsigned P = 0; P < 4; ++P) {
7522 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7523 DAG.getConstant(P, SL, MVT::i32));
7524 }
7525
7526 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7527 Parts[0], Parts[1]));
7528 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7529 Parts[2], Parts[3]));
7530 } else {
7531 assert(VecSize == 512);
7532
7533 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7534 SDValue Parts[8];
7535 for (unsigned P = 0; P < 8; ++P) {
7536 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7537 DAG.getConstant(P, SL, MVT::i32));
7538 }
7539
7540 Lo = DAG.getBitcast(LoVT,
7541 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7542 Parts[0], Parts[1], Parts[2], Parts[3]));
7543 Hi = DAG.getBitcast(HiVT,
7544 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7545 Parts[4], Parts[5], Parts[6], Parts[7]));
7546 }
7547
7548 EVT IdxVT = Idx.getValueType();
7549 unsigned NElem = VecVT.getVectorNumElements();
7550 assert(isPowerOf2_32(NElem));
7551 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7552 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7553 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7555 }
7556
7557 assert(VecSize <= 64);
7558
7559 MVT IntVT = MVT::getIntegerVT(VecSize);
7560
7561 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7562 SDValue VecBC = peekThroughBitcasts(Vec);
7563 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7564 SDValue Src = VecBC.getOperand(0);
7565 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7566 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7567 }
7568
7569 unsigned EltSize = EltVT.getSizeInBits();
7570 assert(isPowerOf2_32(EltSize));
7571
7572 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7573
7574 // Convert vector index to bit-index (* EltSize)
7575 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7576
7577 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7578 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7579
7580 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7581 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7582 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7583 }
7584
7585 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7586}
7587
7588static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7589 assert(Elt % 2 == 0);
7590 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7591}
7592
7593static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7594 assert(Elt % 2 == 0);
7595 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7596 !(Mask[Elt + 1] & 1);
7597}
7598
7599SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7600 SelectionDAG &DAG) const {
7601 SDLoc SL(Op);
7602 EVT ResultVT = Op.getValueType();
7603 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7604 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7605 const int NewSrcNumElts = 2;
7606 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
7607 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7608
7609 // Break up the shuffle into register-sized pieces.
7610 //
7611 // We're trying to form sub-shuffles that the register allocation pipeline
7612 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7613 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7614 // pair of copies into a consecutive register copy, so use the ordinary
7615 // extract_vector_elt lowering unless we can use the shuffle.
7616 //
7617 // TODO: This is a bit of a hack, and we should probably always use
7618 // extract_subvector for the largest possible subvector we can (or at least
7619 // use it for PackVT-aligned pieces). However, we have worse support for
7620 // combines on them and don't directly treat extract_subvector /
7621 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
7622 // job with the extract_subvectors.
7623 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7624
7625 // vector_shuffle <0,1,6,7> lhs, rhs
7626 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7627 //
7628 // vector_shuffle <6,7,2,3> lhs, rhs
7629 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7630 //
7631 // vector_shuffle <6,7,0,1> lhs, rhs
7632 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7633
7634 // Avoid scalarizing when both halves are reading from consecutive elements.
7635
7636 // If we're treating 2 element shuffles as legal, also create odd-to-even
7637 // shuffles of neighboring pairs.
7638 //
7639 // vector_shuffle <3,2,7,6> lhs, rhs
7640 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
7641 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
7642
7644 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7645 if (ShouldUseConsecutiveExtract &&
7647 const int Idx = SVN->getMaskElt(I);
7648 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7649 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7650 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7651 SVN->getOperand(VecIdx),
7652 DAG.getConstant(EltIdx, SL, MVT::i32));
7653 Pieces.push_back(SubVec);
7654 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
7656 int Idx0 = SVN->getMaskElt(I);
7657 int Idx1 = SVN->getMaskElt(I + 1);
7658
7659 SDValue SrcOp0 = SVN->getOperand(0);
7660 SDValue SrcOp1 = SrcOp0;
7661 if (Idx0 >= SrcNumElts) {
7662 SrcOp0 = SVN->getOperand(1);
7663 Idx0 -= SrcNumElts;
7664 }
7665
7666 if (Idx1 >= SrcNumElts) {
7667 SrcOp1 = SVN->getOperand(1);
7668 Idx1 -= SrcNumElts;
7669 }
7670
7671 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
7672 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
7673
7674 // Extract nearest even aligned piece.
7675 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
7676 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
7677 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
7678 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
7679
7680 int NewMaskIdx0 = Idx0 - AlignedIdx0;
7681 int NewMaskIdx1 = Idx1 - AlignedIdx1;
7682
7683 SDValue Result0 = SubVec0;
7684 SDValue Result1 = SubVec0;
7685
7686 if (SubVec0 != SubVec1) {
7687 NewMaskIdx1 += NewSrcNumElts;
7688 Result1 = SubVec1;
7689 } else {
7690 Result1 = DAG.getUNDEF(PackVT);
7691 }
7692
7693 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
7694 {NewMaskIdx0, NewMaskIdx1});
7695 Pieces.push_back(Shuf);
7696 } else {
7697 const int Idx0 = SVN->getMaskElt(I);
7698 const int Idx1 = SVN->getMaskElt(I + 1);
7699 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7700 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7701 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7702 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7703
7704 SDValue Vec0 = SVN->getOperand(VecIdx0);
7705 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7706 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7707
7708 SDValue Vec1 = SVN->getOperand(VecIdx1);
7709 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7710 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7711 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7712 }
7713 }
7714
7715 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7716}
7717
7718SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7719 SelectionDAG &DAG) const {
7720 SDValue SVal = Op.getOperand(0);
7721 EVT ResultVT = Op.getValueType();
7722 EVT SValVT = SVal.getValueType();
7723 SDValue UndefVal = DAG.getUNDEF(SValVT);
7724 SDLoc SL(Op);
7725
7727 VElts.push_back(SVal);
7728 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7729 VElts.push_back(UndefVal);
7730
7731 return DAG.getBuildVector(ResultVT, SL, VElts);
7732}
7733
7734SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7735 SelectionDAG &DAG) const {
7736 SDLoc SL(Op);
7737 EVT VT = Op.getValueType();
7738
7739 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7740 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7741
7742 SDValue Lo = Op.getOperand(0);
7743 SDValue Hi = Op.getOperand(1);
7744
7745 // Avoid adding defined bits with the zero_extend.
7746 if (Hi.isUndef()) {
7747 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7748 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7749 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7750 }
7751
7752 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7753 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7754
7755 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7756 DAG.getConstant(16, SL, MVT::i32));
7757 if (Lo.isUndef())
7758 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7759
7760 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7761 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7762
7763 SDValue Or =
7764 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7765 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7766 }
7767
7768 // Split into 2-element chunks.
7769 const unsigned NumParts = VT.getVectorNumElements() / 2;
7771 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7772
7774 for (unsigned P = 0; P < NumParts; ++P) {
7775 SDValue Vec = DAG.getBuildVector(
7776 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7777 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7778 }
7779
7780 SDValue Blend =
7781 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7782 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7783}
7784
7786 const GlobalAddressSDNode *GA) const {
7787 // OSes that use ELF REL relocations (instead of RELA) can only store a
7788 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7789 // which can create arbitrary 64-bit addends. (This is only a problem for
7790 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7791 // the high 32 bits of the addend.)
7792 //
7793 // This should be kept in sync with how HasRelocationAddend is initialized in
7794 // the constructor of ELFAMDGPUAsmBackend.
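// Rough illustration (not in the original source): folding an offset such as
// 0x1'0000'0000 into the global would require a 64-bit addend; with REL-style
// relocations only 32 bits of that addend can live in the instruction, so the
// R_AMDGPU_*32_HI half of the pair would silently lose the high bits.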
7795 if (!Subtarget->isAmdHsaOS())
7796 return false;
7797
7798 // We can fold offsets for anything that doesn't require a GOT relocation.
7799 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7803}
7804
7805static SDValue
7807 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7808 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7809 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7810 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7811 // lowered to the following code sequence:
7812 //
7813 // For constant address space:
7814 // s_getpc_b64 s[0:1]
7815 // s_add_u32 s0, s0, $symbol
7816 // s_addc_u32 s1, s1, 0
7817 //
7818 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7819 // a fixup or relocation is emitted to replace $symbol with a literal
7820 // constant, which is a pc-relative offset from the encoding of the $symbol
7821 // operand to the global variable.
7822 //
7823 // For global address space:
7824 // s_getpc_b64 s[0:1]
7825 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7826 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7827 //
7828 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7829 // fixups or relocations are emitted to replace $symbol@*@lo and
7830 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7831 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7832 // operand to the global variable.
7833 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7834 SDValue PtrHi;
7835 if (GAFlags == SIInstrInfo::MO_NONE)
7836 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7837 else
7838 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7839 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7840}
7841
7842SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7843 SDValue Op,
7844 SelectionDAG &DAG) const {
7845 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7846 SDLoc DL(GSD);
7847 EVT PtrVT = Op.getValueType();
7848
7849 const GlobalValue *GV = GSD->getGlobal();
7855 GV->hasExternalLinkage()) {
7856 Type *Ty = GV->getValueType();
7857 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7858 // zero-sized type in other languages to declare dynamic shared
7859 // memory whose size is not known at compile time. Such arrays are
7860 // allocated by the runtime and placed directly after the statically
7861 // allocated ones, and they all share the same offset.
7862 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7863 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7864 // Adjust alignment for that dynamic shared memory array.
7866 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7867 MFI->setUsesDynamicLDS(true);
7868 return SDValue(
7869 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7870 }
7871 }
7873 }
7874
7876 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7878 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7879 }
7880
7881 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7882 SDValue AddrLo = DAG.getTargetGlobalAddress(
7883 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7884 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7885
7886 SDValue AddrHi = DAG.getTargetGlobalAddress(
7887 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7888 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7889
7890 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7891 }
7892
7893 if (shouldEmitFixup(GV))
7894 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7895
7896 if (shouldEmitPCReloc(GV))
7897 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7899
7900 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7902 PointerType *PtrTy =
7904 const DataLayout &DataLayout = DAG.getDataLayout();
7905 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7906 MachinePointerInfo PtrInfo =
7908
7909 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7912}
7913
7915 const SDLoc &DL, SDValue V) const {
7916 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7917 // the destination register.
7918 //
7919 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7920 // so we will end up with redundant moves to m0.
7921 //
7922 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7923
7924 // A Null SDValue creates a glue result.
7925 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7926 V, Chain);
7927 return SDValue(M0, 0);
7928}
7929
7930SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7931 MVT VT,
7932 unsigned Offset) const {
7933 SDLoc SL(Op);
7934 SDValue Param = lowerKernargMemParameter(
7935 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7936 // The local size values will have the hi 16-bits as zero.
7937 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7938 DAG.getValueType(VT));
7939}
7940
7942 EVT VT) {
7944 "non-hsa intrinsic with hsa target",
7945 DL.getDebugLoc());
7946 DAG.getContext()->diagnose(BadIntrin);
7947 return DAG.getUNDEF(VT);
7948}
7949
7951 EVT VT) {
7953 "intrinsic not supported on subtarget",
7954 DL.getDebugLoc());
7955 DAG.getContext()->diagnose(BadIntrin);
7956 return DAG.getUNDEF(VT);
7957}
7958
7960 ArrayRef<SDValue> Elts) {
7961 assert(!Elts.empty());
7962 MVT Type;
7963 unsigned NumElts = Elts.size();
7964
7965 if (NumElts <= 12) {
7966 Type = MVT::getVectorVT(MVT::f32, NumElts);
7967 } else {
7968 assert(Elts.size() <= 16);
7969 Type = MVT::v16f32;
7970 NumElts = 16;
7971 }
7972
7973 SmallVector<SDValue, 16> VecElts(NumElts);
7974 for (unsigned i = 0; i < Elts.size(); ++i) {
7975 SDValue Elt = Elts[i];
7976 if (Elt.getValueType() != MVT::f32)
7977 Elt = DAG.getBitcast(MVT::f32, Elt);
7978 VecElts[i] = Elt;
7979 }
7980 for (unsigned i = Elts.size(); i < NumElts; ++i)
7981 VecElts[i] = DAG.getUNDEF(MVT::f32);
7982
7983 if (NumElts == 1)
7984 return VecElts[0];
7985 return DAG.getBuildVector(Type, DL, VecElts);
7986}
7987
7988static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7989 SDValue Src, int ExtraElts) {
7990 EVT SrcVT = Src.getValueType();
7991
7993
7994 if (SrcVT.isVector())
7995 DAG.ExtractVectorElements(Src, Elts);
7996 else
7997 Elts.push_back(Src);
7998
7999 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
8000 while (ExtraElts--)
8001 Elts.push_back(Undef);
8002
8003 return DAG.getBuildVector(CastVT, DL, Elts);
8004}
8005
8006 // Re-construct the required return value for an image load intrinsic.
8007 // This is more complicated due to the optional use of TexFailCtrl, which
8008 // means the required return type is an aggregate.
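// For example (illustrative, not from the original source): a d16 image load
// with dmask = 0b0101 and TFE enabled is declared to return { v2f16, i32 }
// plus the chain, but on targets with packed d16 the machine instruction
// produces one packed data dword plus one texfail dword; this function
// unpacks that back into the aggregate the intrinsic promised.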
8010 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8011 bool Unpacked, bool IsD16, int DMaskPop,
8012 int NumVDataDwords, bool IsAtomicPacked16Bit,
8013 const SDLoc &DL) {
8014 // Determine the required return type. This is the same regardless of
8015 // IsTexFail flag
8016 EVT ReqRetVT = ResultTypes[0];
8017 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8018 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8019 ? (ReqRetNumElts + 1) / 2
8020 : ReqRetNumElts;
8021
8022 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8023
8024 MVT DataDwordVT =
8025 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8026
8027 MVT MaskPopVT =
8028 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8029
8030 SDValue Data(Result, 0);
8031 SDValue TexFail;
8032
8033 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8034 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8035 if (MaskPopVT.isVector()) {
8036 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8037 SDValue(Result, 0), ZeroIdx);
8038 } else {
8039 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8040 SDValue(Result, 0), ZeroIdx);
8041 }
8042 }
8043
8044 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8045 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8046 NumDataDwords - MaskPopDwords);
8047
8048 if (IsD16)
8049 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8050
8051 EVT LegalReqRetVT = ReqRetVT;
8052 if (!ReqRetVT.isVector()) {
8053 if (!Data.getValueType().isInteger())
8054 Data = DAG.getNode(ISD::BITCAST, DL,
8055 Data.getValueType().changeTypeToInteger(), Data);
8056 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8057 } else {
8058 // We need to widen the return vector to a legal type
8059 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8060 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8061 LegalReqRetVT =
8063 ReqRetVT.getVectorNumElements() + 1);
8064 }
8065 }
8066 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8067
8068 if (IsTexFail) {
8069 TexFail =
8070 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8071 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8072
8073 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8074 }
8075
8076 if (Result->getNumValues() == 1)
8077 return Data;
8078
8079 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8080}
8081
8082static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8083 SDValue *LWE, bool &IsTexFail) {
8084 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8085
8086 uint64_t Value = TexFailCtrlConst->getZExtValue();
8087 if (Value) {
8088 IsTexFail = true;
8089 }
8090
8091 SDLoc DL(TexFailCtrlConst);
8092 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8093 Value &= ~(uint64_t)0x1;
8094 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8095 Value &= ~(uint64_t)0x2;
8096
8097 return Value == 0;
8098}
8099
8101 MVT PackVectorVT,
8102 SmallVectorImpl<SDValue> &PackedAddrs,
8103 unsigned DimIdx, unsigned EndIdx,
8104 unsigned NumGradients) {
8105 SDLoc DL(Op);
8106 for (unsigned I = DimIdx; I < EndIdx; I++) {
8107 SDValue Addr = Op.getOperand(I);
8108
8109 // Gradients are packed with undef for each coordinate.
8110 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8111 // 1D: undef,dx/dh; undef,dx/dv
8112 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8113 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8114 if (((I + 1) >= EndIdx) ||
8115 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8116 I == DimIdx + NumGradients - 1))) {
8117 if (Addr.getValueType() != MVT::i16)
8118 Addr = DAG.getBitcast(MVT::i16, Addr);
8119 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8120 } else {
8121 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8122 I++;
8123 }
8124 Addr = DAG.getBitcast(MVT::f32, Addr);
8125 PackedAddrs.push_back(Addr);
8126 }
8127}
8128
8129SDValue SITargetLowering::lowerImage(SDValue Op,
8131 SelectionDAG &DAG, bool WithChain) const {
8132 SDLoc DL(Op);
8134 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8135 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8137 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8138 unsigned IntrOpcode = Intr->BaseOpcode;
8139 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8140 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8141 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8142
8143 SmallVector<EVT, 3> ResultTypes(Op->values());
8144 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8145 bool IsD16 = false;
8146 bool IsG16 = false;
8147 bool IsA16 = false;
8148 SDValue VData;
8149 int NumVDataDwords = 0;
8150 bool AdjustRetType = false;
8151 bool IsAtomicPacked16Bit = false;
8152
8153 // Offset of intrinsic arguments
8154 const unsigned ArgOffset = WithChain ? 2 : 1;
8155
8156 unsigned DMask;
8157 unsigned DMaskLanes = 0;
8158
8159 if (BaseOpcode->Atomic) {
8160 VData = Op.getOperand(2);
8161
8162 IsAtomicPacked16Bit =
8163 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8164 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8165
8166 bool Is64Bit = VData.getValueSizeInBits() == 64;
8167 if (BaseOpcode->AtomicX2) {
8168 SDValue VData2 = Op.getOperand(3);
8169 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8170 {VData, VData2});
8171 if (Is64Bit)
8172 VData = DAG.getBitcast(MVT::v4i32, VData);
8173
8174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8175 DMask = Is64Bit ? 0xf : 0x3;
8176 NumVDataDwords = Is64Bit ? 4 : 2;
8177 } else {
8178 DMask = Is64Bit ? 0x3 : 0x1;
8179 NumVDataDwords = Is64Bit ? 2 : 1;
8180 }
8181 } else {
8182 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8183 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8184
8185 if (BaseOpcode->Store) {
8186 VData = Op.getOperand(2);
8187
8188 MVT StoreVT = VData.getSimpleValueType();
8189 if (StoreVT.getScalarType() == MVT::f16) {
8190 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8191 return Op; // D16 is unsupported for this instruction
8192
8193 IsD16 = true;
8194 VData = handleD16VData(VData, DAG, true);
8195 }
8196
8197 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8198 } else if (!BaseOpcode->NoReturn) {
8199 // Work out the num dwords based on the dmask popcount and underlying type
8200 // and whether packing is supported.
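// For example (illustrative): dmask = 0b1011 gives DMaskLanes = 3, so an f32
// load needs 3 result dwords, while a packed d16 (f16) load needs
// (3 + 1) / 2 = 2 dwords.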
8201 MVT LoadVT = ResultTypes[0].getSimpleVT();
8202 if (LoadVT.getScalarType() == MVT::f16) {
8203 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8204 return Op; // D16 is unsupported for this instruction
8205
8206 IsD16 = true;
8207 }
8208
8209 // Confirm that the return type is large enough for the dmask specified
8210 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8211 (!LoadVT.isVector() && DMaskLanes > 1))
8212 return Op;
8213
8214 // The sq block of gfx8 and gfx9 does not estimate register use correctly
8215 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8216 // instructions.
8217 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8218 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8219 NumVDataDwords = (DMaskLanes + 1) / 2;
8220 else
8221 NumVDataDwords = DMaskLanes;
8222
8223 AdjustRetType = true;
8224 }
8225 }
8226
8227 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8229
8230 // Check for 16 bit addresses or derivatives and pack if true.
8231 MVT VAddrVT =
8232 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8233 MVT VAddrScalarVT = VAddrVT.getScalarType();
8234 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8235 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8236
8237 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8238 VAddrScalarVT = VAddrVT.getScalarType();
8239 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8240 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8241
8242 // Push back extra arguments.
8243 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8244 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8245 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8246 // Special handling of bias when A16 is on. Bias is of type half but
8247 // occupies a full 32 bits.
8248 SDValue Bias = DAG.getBuildVector(
8249 MVT::v2f16, DL,
8250 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8251 VAddrs.push_back(Bias);
8252 } else {
8253 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8254 "Bias needs to be converted to 16 bit in A16 mode");
8255 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8256 }
8257 }
8258
8259 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8260 // 16 bit gradients are supported, but are tied to the A16 control
8261 // so both gradients and addresses must be 16 bit
8262 LLVM_DEBUG(
8263 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8264 "require 16 bit args for both gradients and addresses");
8265 return Op;
8266 }
8267
8268 if (IsA16) {
8269 if (!ST->hasA16()) {
8270 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8271 "support 16 bit addresses\n");
8272 return Op;
8273 }
8274 }
8275
8276 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8277 // is set then we have to compress/pack the operands (address,
8278 // gradient, or both).
8279 // In the case where A16 and gradients are tied (no G16 support), we
8280 // have already verified that both IsA16 and IsG16 are true.
8281 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8282 // Activate g16
8283 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8285 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8286 }
8287
8288 // Add gradients (packed or unpacked)
8289 if (IsG16) {
8290 // Pack the gradients
8291 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8292 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8293 ArgOffset + Intr->GradientStart,
8294 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8295 } else {
8296 for (unsigned I = ArgOffset + Intr->GradientStart;
8297 I < ArgOffset + Intr->CoordStart; I++)
8298 VAddrs.push_back(Op.getOperand(I));
8299 }
8300
8301 // Add addresses (packed or unpacked)
8302 if (IsA16) {
8303 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8304 ArgOffset + Intr->CoordStart, VAddrEnd,
8305 0 /* No gradients */);
8306 } else {
8307 // Add uncompressed address
8308 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8309 VAddrs.push_back(Op.getOperand(I));
8310 }
8311
8312 // If the register allocator cannot place the address registers contiguously
8313 // without introducing moves, then using the non-sequential address encoding
8314 // is always preferable, since it saves VALU instructions and is usually a
8315 // wash in terms of code size or even better.
8316 //
8317 // However, we currently have no way of hinting to the register allocator that
8318 // MIMG addresses should be placed contiguously when it is possible to do so,
8319 // so force non-NSA for the common 2-address case as a heuristic.
8320 //
8321 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8322 // allocation when possible.
8323 //
8324 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8325 // set of the remaining addresses.
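// Illustrative sketch (not from the original source): with NSA an instruction
// like
//   image_sample v[0:3], [v7, v2, v9], s[0:7], s[8:11] dmask:0xf
// may take each address operand from an arbitrary VGPR, whereas the non-NSA
// encoding requires one contiguous range such as v[2:4].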
8326 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8327 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8328 const bool UseNSA = ST->hasNSAEncoding() &&
8329 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8330 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8331 const bool UsePartialNSA =
8332 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8333
8334 SDValue VAddr;
8335 if (UsePartialNSA) {
8336 VAddr = getBuildDwordsVector(DAG, DL,
8337 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8338 } else if (!UseNSA) {
8339 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8340 }
8341
8342 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8343 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8344 SDValue Unorm;
8345 if (!BaseOpcode->Sampler) {
8346 Unorm = True;
8347 } else {
8348 uint64_t UnormConst =
8349 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8350
8351 Unorm = UnormConst ? True : False;
8352 }
8353
8354 SDValue TFE;
8355 SDValue LWE;
8356 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8357 bool IsTexFail = false;
8358 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8359 return Op;
8360
8361 if (IsTexFail) {
8362 if (!DMaskLanes) {
8363 // Expecting to get an error flag since TFC is on - and dmask is 0
8364 // Force dmask to be at least 1 otherwise the instruction will fail
8365 DMask = 0x1;
8366 DMaskLanes = 1;
8367 NumVDataDwords = 1;
8368 }
8369 NumVDataDwords += 1;
8370 AdjustRetType = true;
8371 }
8372
8373 // Something earlier may have tagged the return type as needing adjustment.
8374 // This happens if the instruction is a load or has set TexFailCtrl flags.
8375 if (AdjustRetType) {
8376 // NumVDataDwords reflects the true number of dwords required in the return
8377 // type
8378 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8379 // This is a no-op load. This can be eliminated
8380 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8381 if (isa<MemSDNode>(Op))
8382 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8383 return Undef;
8384 }
8385
8386 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8387 MVT::i32, NumVDataDwords)
8388 : MVT::i32;
8389
8390 ResultTypes[0] = NewVT;
8391 if (ResultTypes.size() == 3) {
8392 // The original result was an aggregate type used for TexFailCtrl results.
8393 // The actual instruction returns a vector type, which has now been
8394 // created. Remove the aggregate result.
8395 ResultTypes.erase(&ResultTypes[1]);
8396 }
8397 }
8398
8399 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8400 if (BaseOpcode->Atomic)
8401 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8402 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8404 return Op;
8405
8407 if (BaseOpcode->Store || BaseOpcode->Atomic)
8408 Ops.push_back(VData); // vdata
8409 if (UsePartialNSA) {
8410 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8411 Ops.push_back(VAddr);
8412 } else if (UseNSA)
8413 append_range(Ops, VAddrs);
8414 else
8415 Ops.push_back(VAddr);
8416 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8417 EVT RsrcVT = Rsrc.getValueType();
8418 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8419 return Op;
8420 Ops.push_back(Rsrc);
8421 if (BaseOpcode->Sampler) {
8422 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8423 if (Samp.getValueType() != MVT::v4i32)
8424 return Op;
8425 Ops.push_back(Samp);
8426 }
8427 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8428 if (IsGFX10Plus)
8429 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8430 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8431 Ops.push_back(Unorm);
8432 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8433 Ops.push_back(IsA16 && // r128, a16 for gfx9
8434 ST->hasFeature(AMDGPU::FeatureR128A16)
8435 ? True
8436 : False);
8437 if (IsGFX10Plus)
8438 Ops.push_back(IsA16 ? True : False);
8439 if (!Subtarget->hasGFX90AInsts()) {
8440 Ops.push_back(TFE); // tfe
8441 } else if (TFE->getAsZExtVal()) {
8442 report_fatal_error("TFE is not supported on this GPU");
8443 }
8444 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8445 Ops.push_back(LWE); // lwe
8446 if (!IsGFX10Plus)
8447 Ops.push_back(DimInfo->DA ? True : False);
8448 if (BaseOpcode->HasD16)
8449 Ops.push_back(IsD16 ? True : False);
8450 if (isa<MemSDNode>(Op))
8451 Ops.push_back(Op.getOperand(0)); // chain
8452
8453 int NumVAddrDwords =
8454 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8455 int Opcode = -1;
8456
8457 if (IsGFX12Plus) {
8458 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8459 NumVDataDwords, NumVAddrDwords);
8460 } else if (IsGFX11Plus) {
8461 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8462 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8463 : AMDGPU::MIMGEncGfx11Default,
8464 NumVDataDwords, NumVAddrDwords);
8465 } else if (IsGFX10Plus) {
8466 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8467 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8468 : AMDGPU::MIMGEncGfx10Default,
8469 NumVDataDwords, NumVAddrDwords);
8470 } else {
8471 if (Subtarget->hasGFX90AInsts()) {
8472 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8473 NumVDataDwords, NumVAddrDwords);
8474 if (Opcode == -1)
8476 "requested image instruction is not supported on this GPU");
8477 }
8478 if (Opcode == -1 &&
8480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8481 NumVDataDwords, NumVAddrDwords);
8482 if (Opcode == -1)
8483 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8484 NumVDataDwords, NumVAddrDwords);
8485 }
8486 if (Opcode == -1)
8487 return Op;
8488
8489 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8490 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8491 MachineMemOperand *MemRef = MemOp->getMemOperand();
8492 DAG.setNodeMemRefs(NewNode, {MemRef});
8493 }
8494
8495 if (BaseOpcode->AtomicX2) {
8497 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8498 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8499 }
8500 if (BaseOpcode->NoReturn)
8501 return SDValue(NewNode, 0);
8502 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8503 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8504 NumVDataDwords, IsAtomicPacked16Bit, DL);
8505}
8506
8507SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8508 SDValue Offset, SDValue CachePolicy,
8509 SelectionDAG &DAG) const {
8511
8512 const DataLayout &DataLayout = DAG.getDataLayout();
8513 Align Alignment =
8515
8520 VT.getStoreSize(), Alignment);
8521
8522 if (!Offset->isDivergent()) {
8523 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8524
8525 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8526 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8527 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8528 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8529 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8530 SDValue BufferLoad =
8532 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8533 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8534 }
8535
8536 // Widen vec3 load to vec4.
8537 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8538 !Subtarget->hasScalarDwordx3Loads()) {
8539 EVT WidenedVT =
8541 auto WidenedOp = DAG.getMemIntrinsicNode(
8542 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8543 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8544 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8545 DAG.getVectorIdxConstant(0, DL));
8546 return Subvector;
8547 }
8548
8550 DAG.getVTList(VT), Ops, VT, MMO);
8551 }
8552
8553 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8554 // assume that the buffer is unswizzled.
8555 SDValue Ops[] = {
8556 DAG.getEntryNode(), // Chain
8557 Rsrc, // rsrc
8558 DAG.getConstant(0, DL, MVT::i32), // vindex
8559 {}, // voffset
8560 {}, // soffset
8561 {}, // offset
8562 CachePolicy, // cachepolicy
8563 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8564 };
8565 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8566 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8567 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8568 }
8569
8571 unsigned NumLoads = 1;
8572 MVT LoadVT = VT.getSimpleVT();
8573 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8574 assert((LoadVT.getScalarType() == MVT::i32 ||
8575 LoadVT.getScalarType() == MVT::f32));
8576
8577 if (NumElts == 8 || NumElts == 16) {
8578 NumLoads = NumElts / 4;
8579 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8580 }
8581
8582 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8583
8584   // Use the alignment to ensure that the required offsets will fit into the
8585   // instructions' immediate offset fields.
8586 setBufferOffsets(Offset, DAG, &Ops[3],
8587 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8588
8589 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8590 for (unsigned i = 0; i < NumLoads; ++i) {
8591 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8592 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8593 LoadVT, MMO, DAG));
8594 }
8595
8596 if (NumElts == 8 || NumElts == 16)
8597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8598
8599 return Loads[0];
8600}
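// Illustrative example of the two paths above (a sketch, not verbatim
// selection output): with a uniform offset,
//   %v = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc,
//                                                          i32 %off, i32 0)
// stays on the scalar SBUFFER_LOAD path, while a divergent %off falls through
// to the MUBUF path and is emitted as one BUFFER_LOAD per 16-byte piece
// (v8/v16 results are split into dwordx4 loads and rejoined with
// CONCAT_VECTORS).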
8601
8602SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8604   // With architected SGPRs, the wave ID within the workgroup is in TTMP8[29:25].
8604 if (!Subtarget->hasArchitectedSGPRs())
8605 return {};
8606 SDLoc SL(Op);
8607 MVT VT = MVT::i32;
8608 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8609 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8610 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8611}
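// For reference, the BFE above computes waveID = (TTMP8 >> 25) & 0x1f, i.e. a
// 5-bit field starting at bit 25, which is where hardware with architected
// SGPRs reports the wave's index within its workgroup.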
8612
8613SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8614 unsigned Dim,
8615 const ArgDescriptor &Arg) const {
8616 SDLoc SL(Op);
8618 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8619 if (MaxID == 0)
8620 return DAG.getConstant(0, SL, MVT::i32);
8621
8622 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8623 SDLoc(DAG.getEntryNode()), Arg);
8624
8625 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8626 // masking operations anyway.
8627 //
8628 // TODO: We could assert the top bit is 0 for the source copy.
8629 if (Arg.isMasked())
8630 return Val;
8631
8632 // Preserve the known bits after expansion to a copy.
8634 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8635 DAG.getValueType(SmallVT));
8636}
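// Example of the known-bits preservation above: if the subtarget bounds
// workitem id X to at most 1023, MaxID is 1023 and the loaded VGPR value is
// wrapped in an AssertZext whose value type is just wide enough to hold that
// bound (i10 here), so later combines know the upper bits are zero.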
8637
8638SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8639                                                   SelectionDAG &DAG) const {
8640   MachineFunction &MF = DAG.getMachineFunction();
8641   auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8642
8643 EVT VT = Op.getValueType();
8644 SDLoc DL(Op);
8645 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8646
8647 // TODO: Should this propagate fast-math-flags?
8648
8649 switch (IntrinsicID) {
8650 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8651 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8652 return emitNonHSAIntrinsicError(DAG, DL, VT);
8653 return getPreloadedValue(DAG, *MFI, VT,
8655 }
8656 case Intrinsic::amdgcn_dispatch_ptr:
8657 case Intrinsic::amdgcn_queue_ptr: {
8658 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8659 DiagnosticInfoUnsupported BadIntrin(
8660 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8661 DL.getDebugLoc());
8662 DAG.getContext()->diagnose(BadIntrin);
8663 return DAG.getUNDEF(VT);
8664 }
8665
8666 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8669 return getPreloadedValue(DAG, *MFI, VT, RegID);
8670 }
8671 case Intrinsic::amdgcn_implicitarg_ptr: {
8672 if (MFI->isEntryFunction())
8673 return getImplicitArgPtr(DAG, DL);
8674 return getPreloadedValue(DAG, *MFI, VT,
8676 }
8677 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8679 // This only makes sense to call in a kernel, so just lower to null.
8680 return DAG.getConstant(0, DL, VT);
8681 }
8682
8683 return getPreloadedValue(DAG, *MFI, VT,
8685 }
8686 case Intrinsic::amdgcn_dispatch_id: {
8687 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8688 }
8689 case Intrinsic::amdgcn_rcp:
8690 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8691 case Intrinsic::amdgcn_rsq:
8692 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8693 case Intrinsic::amdgcn_rsq_legacy:
8695 return emitRemovedIntrinsicError(DAG, DL, VT);
8696 return SDValue();
8697 case Intrinsic::amdgcn_rcp_legacy:
8699 return emitRemovedIntrinsicError(DAG, DL, VT);
8700 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8701 case Intrinsic::amdgcn_rsq_clamp: {
8703 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8704
8705 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8708
8709 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8710 SDValue Tmp =
8711 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8712 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8713 DAG.getConstantFP(Min, DL, VT));
8714 }
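  // Worked example of the rsq_clamp expansion above, where Max and Min are
  // intended to be the largest positive and negative finite values of the
  // type: rsq_clamp(0.0) yields rsq = +inf, fminnum(+inf, Max) = Max, and
  // fmaxnum(Max, Min) = Max, matching the saturating behaviour of the
  // dedicated instruction.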
8715 case Intrinsic::r600_read_ngroups_x:
8716 if (Subtarget->isAmdHsaOS())
8717 return emitNonHSAIntrinsicError(DAG, DL, VT);
8718
8719 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8721 false);
8722 case Intrinsic::r600_read_ngroups_y:
8723 if (Subtarget->isAmdHsaOS())
8724 return emitNonHSAIntrinsicError(DAG, DL, VT);
8725
8726 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8728 false);
8729 case Intrinsic::r600_read_ngroups_z:
8730 if (Subtarget->isAmdHsaOS())
8731 return emitNonHSAIntrinsicError(DAG, DL, VT);
8732
8733 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8735 false);
8736 case Intrinsic::r600_read_global_size_x:
8737 if (Subtarget->isAmdHsaOS())
8738 return emitNonHSAIntrinsicError(DAG, DL, VT);
8739
8740 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8742 Align(4), false);
8743 case Intrinsic::r600_read_global_size_y:
8744 if (Subtarget->isAmdHsaOS())
8745 return emitNonHSAIntrinsicError(DAG, DL, VT);
8746
8747 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8749 Align(4), false);
8750 case Intrinsic::r600_read_global_size_z:
8751 if (Subtarget->isAmdHsaOS())
8752 return emitNonHSAIntrinsicError(DAG, DL, VT);
8753
8754 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8756 Align(4), false);
8757 case Intrinsic::r600_read_local_size_x:
8758 if (Subtarget->isAmdHsaOS())
8759 return emitNonHSAIntrinsicError(DAG, DL, VT);
8760
8761 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8763 case Intrinsic::r600_read_local_size_y:
8764 if (Subtarget->isAmdHsaOS())
8765 return emitNonHSAIntrinsicError(DAG, DL, VT);
8766
8767 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8769 case Intrinsic::r600_read_local_size_z:
8770 if (Subtarget->isAmdHsaOS())
8771 return emitNonHSAIntrinsicError(DAG, DL, VT);
8772
8773 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8775 case Intrinsic::amdgcn_workgroup_id_x:
8776 return getPreloadedValue(DAG, *MFI, VT,
8778 case Intrinsic::amdgcn_workgroup_id_y:
8779 return getPreloadedValue(DAG, *MFI, VT,
8781 case Intrinsic::amdgcn_workgroup_id_z:
8782 return getPreloadedValue(DAG, *MFI, VT,
8784 case Intrinsic::amdgcn_wave_id:
8785 return lowerWaveID(DAG, Op);
8786 case Intrinsic::amdgcn_lds_kernel_id: {
8787 if (MFI->isEntryFunction())
8788 return getLDSKernelId(DAG, DL);
8789 return getPreloadedValue(DAG, *MFI, VT,
8791 }
8792 case Intrinsic::amdgcn_workitem_id_x:
8793 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8794 case Intrinsic::amdgcn_workitem_id_y:
8795 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8796 case Intrinsic::amdgcn_workitem_id_z:
8797 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8798 case Intrinsic::amdgcn_wavefrontsize:
8800 SDLoc(Op), MVT::i32);
8801 case Intrinsic::amdgcn_s_buffer_load: {
8802 unsigned CPol = Op.getConstantOperandVal(3);
8803 // s_buffer_load, because of how it's optimized, can't be volatile
8804 // so reject ones with the volatile bit set.
8805 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8808 return Op;
8809 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8810 Op.getOperand(3), DAG);
8811 }
8812 case Intrinsic::amdgcn_fdiv_fast:
8813 return lowerFDIV_FAST(Op, DAG);
8814 case Intrinsic::amdgcn_sin:
8815 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8816
8817 case Intrinsic::amdgcn_cos:
8818 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8819
8820 case Intrinsic::amdgcn_mul_u24:
8821 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8822 Op.getOperand(2));
8823 case Intrinsic::amdgcn_mul_i24:
8824 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8825 Op.getOperand(2));
8826
8827 case Intrinsic::amdgcn_log_clamp: {
8829 return SDValue();
8830
8831 return emitRemovedIntrinsicError(DAG, DL, VT);
8832 }
8833 case Intrinsic::amdgcn_fract:
8834 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8835
8836 case Intrinsic::amdgcn_class:
8837 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8838 Op.getOperand(2));
8839 case Intrinsic::amdgcn_div_fmas:
8840 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8841 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8842
8843 case Intrinsic::amdgcn_div_fixup:
8844 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8845 Op.getOperand(2), Op.getOperand(3));
8846
8847 case Intrinsic::amdgcn_div_scale: {
8848 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8849
8850     // Translate to the operands expected by the machine instruction: src0 must
8851     // be whichever of the two sources the constant third operand selects.
8852 SDValue Numerator = Op.getOperand(1);
8853 SDValue Denominator = Op.getOperand(2);
8854
8855     // Note this order is the opposite of the machine instruction's operand
8856     // order, which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator.
8857     // The intrinsic has the numerator as the first operand to match a normal
8858     // division operation.
8859
8860 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8861
8862 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8863 Denominator, Numerator);
8864 }
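  // Example of the operand selection above: for
  //   call { float, i1 } @llvm.amdgcn.div.scale.f32(float %n, float %d, i1 true)
  // the constant true picks the numerator, so the node is built as
  // DIV_SCALE(%n, %d, %n); with i1 false, src0 would be %d instead.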
8865 case Intrinsic::amdgcn_icmp: {
8866 // There is a Pat that handles this variant, so return it as-is.
8867 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8868 Op.getConstantOperandVal(2) == 0 &&
8869 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8870 return Op;
8871 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8872 }
8873 case Intrinsic::amdgcn_fcmp: {
8874 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8875 }
8876 case Intrinsic::amdgcn_ballot:
8877 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8878 case Intrinsic::amdgcn_fmed3:
8879 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8880 Op.getOperand(2), Op.getOperand(3));
8881 case Intrinsic::amdgcn_fdot2:
8882 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8883 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8884 case Intrinsic::amdgcn_fmul_legacy:
8885 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8886 Op.getOperand(2));
8887 case Intrinsic::amdgcn_sffbh:
8888 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8889 case Intrinsic::amdgcn_sbfe:
8890 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8891 Op.getOperand(2), Op.getOperand(3));
8892 case Intrinsic::amdgcn_ubfe:
8893 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8894 Op.getOperand(2), Op.getOperand(3));
8895 case Intrinsic::amdgcn_cvt_pkrtz:
8896 case Intrinsic::amdgcn_cvt_pknorm_i16:
8897 case Intrinsic::amdgcn_cvt_pknorm_u16:
8898 case Intrinsic::amdgcn_cvt_pk_i16:
8899 case Intrinsic::amdgcn_cvt_pk_u16: {
8900 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8901 EVT VT = Op.getValueType();
8902 unsigned Opcode;
8903
8904 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8906 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8908 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8910 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8912 else
8914
8915 if (isTypeLegal(VT))
8916 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8917
8918 SDValue Node =
8919 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8920 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8921 }
8922 case Intrinsic::amdgcn_fmad_ftz:
8923 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8924 Op.getOperand(2), Op.getOperand(3));
8925
8926 case Intrinsic::amdgcn_if_break:
8927 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8928 Op->getOperand(1), Op->getOperand(2)),
8929 0);
8930
8931 case Intrinsic::amdgcn_groupstaticsize: {
8933 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8934 return Op;
8935
8936 const Module *M = MF.getFunction().getParent();
8937 const GlobalValue *GV =
8938 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8939 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8941 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8942 }
8943 case Intrinsic::amdgcn_is_shared:
8944 case Intrinsic::amdgcn_is_private: {
8945 SDLoc SL(Op);
8946 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8949 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8950 SDValue SrcVec =
8951 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8952
8953 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8954 DAG.getConstant(1, SL, MVT::i32));
8955 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8956 }
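  // Example of the check above: a flat pointer is in the shared (or private)
  // segment exactly when its upper 32 bits equal that segment's aperture base,
  // so this compares element 1 of the pointer bitcast to v2i32 against the
  // aperture value returned by getSegmentAperture.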
8957 case Intrinsic::amdgcn_perm:
8958 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8959 Op.getOperand(2), Op.getOperand(3));
8960 case Intrinsic::amdgcn_reloc_constant: {
8961 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8962 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8963 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8964 auto *RelocSymbol = cast<GlobalVariable>(
8965 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8966 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8968 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8969 }
8970 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8971 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8972 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8973 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8974 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8975 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8976 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8978 if (Op.getOperand(4).getValueType() == MVT::i32)
8979 return SDValue();
8980
8981 SDLoc SL(Op);
8982 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8983 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8984 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8985 Op.getOperand(3), IndexKeyi32);
8986 }
8987 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8988 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8989 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8990 if (Op.getOperand(6).getValueType() == MVT::i32)
8991 return SDValue();
8992
8993 SDLoc SL(Op);
8994 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8995 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8996 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8997 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8998 IndexKeyi32, Op.getOperand(7)});
8999 }
9000 case Intrinsic::amdgcn_addrspacecast_nonnull:
9001 return lowerADDRSPACECAST(Op, DAG);
9002 case Intrinsic::amdgcn_readlane:
9003 case Intrinsic::amdgcn_readfirstlane:
9004 case Intrinsic::amdgcn_writelane:
9005 case Intrinsic::amdgcn_permlane16:
9006 case Intrinsic::amdgcn_permlanex16:
9007 case Intrinsic::amdgcn_permlane64:
9008 case Intrinsic::amdgcn_set_inactive:
9009 case Intrinsic::amdgcn_set_inactive_chain_arg:
9010 case Intrinsic::amdgcn_mov_dpp8:
9011 case Intrinsic::amdgcn_update_dpp:
9012 return lowerLaneOp(*this, Op.getNode(), DAG);
9013 default:
9014 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9016 return lowerImage(Op, ImageDimIntr, DAG, false);
9017
9018 return Op;
9019 }
9020}
9021
9022 // On targets that do not support a constant in the soffset field, turn a zero
9023 // soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
9025 const GCNSubtarget *Subtarget) {
9026 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9027 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9028 return SOffset;
9029}
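// Example: on a subtarget with a restricted soffset field, a buffer intrinsic
// called with soffset 0 gets the SGPR_NULL register operand from here rather
// than an s_mov_b32 materializing the constant 0.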
9030
9031SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9032 SelectionDAG &DAG,
9033 unsigned NewOpcode) const {
9034 SDLoc DL(Op);
9035
9036 SDValue VData = Op.getOperand(2);
9037 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9038 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9039 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9040 SDValue Ops[] = {
9041 Op.getOperand(0), // Chain
9042 VData, // vdata
9043 Rsrc, // rsrc
9044 DAG.getConstant(0, DL, MVT::i32), // vindex
9045 VOffset, // voffset
9046 SOffset, // soffset
9047 Offset, // offset
9048 Op.getOperand(6), // cachepolicy
9049 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9050 };
9051
9052 auto *M = cast<MemSDNode>(Op);
9053
9054 EVT MemVT = VData.getValueType();
9055 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9056 M->getMemOperand());
9057}
9058
9059SDValue
9060SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9061 unsigned NewOpcode) const {
9062 SDLoc DL(Op);
9063
9064 SDValue VData = Op.getOperand(2);
9065 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9066 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9067 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9068 SDValue Ops[] = {
9069 Op.getOperand(0), // Chain
9070 VData, // vdata
9071 Rsrc, // rsrc
9072 Op.getOperand(4), // vindex
9073 VOffset, // voffset
9074 SOffset, // soffset
9075 Offset, // offset
9076 Op.getOperand(7), // cachepolicy
9077 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9078 };
9079
9080 auto *M = cast<MemSDNode>(Op);
9081
9082 EVT MemVT = VData.getValueType();
9083 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9084 M->getMemOperand());
9085}
9086
9087SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9088 SelectionDAG &DAG) const {
9089 unsigned IntrID = Op.getConstantOperandVal(1);
9090 SDLoc DL(Op);
9091
9092 switch (IntrID) {
9093 case Intrinsic::amdgcn_ds_ordered_add:
9094 case Intrinsic::amdgcn_ds_ordered_swap: {
9095 MemSDNode *M = cast<MemSDNode>(Op);
9096 SDValue Chain = M->getOperand(0);
9097 SDValue M0 = M->getOperand(2);
9098 SDValue Value = M->getOperand(3);
9099 unsigned IndexOperand = M->getConstantOperandVal(7);
9100 unsigned WaveRelease = M->getConstantOperandVal(8);
9101 unsigned WaveDone = M->getConstantOperandVal(9);
9102
9103 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9104 IndexOperand &= ~0x3f;
9105 unsigned CountDw = 0;
9106
9107 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9108 CountDw = (IndexOperand >> 24) & 0xf;
9109 IndexOperand &= ~(0xf << 24);
9110
9111 if (CountDw < 1 || CountDw > 4) {
9113 "ds_ordered_count: dword count must be between 1 and 4");
9114 }
9115 }
9116
9117 if (IndexOperand)
9118 report_fatal_error("ds_ordered_count: bad index operand");
9119
9120 if (WaveDone && !WaveRelease)
9121 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9122
9123 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9124 unsigned ShaderType =
9126 unsigned Offset0 = OrderedCountIndex << 2;
9127 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9128
9129 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9130 Offset1 |= (CountDw - 1) << 6;
9131
9132 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9133 Offset1 |= ShaderType << 2;
9134
9135 unsigned Offset = Offset0 | (Offset1 << 8);
9136
9137 SDValue Ops[] = {
9138 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9139 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9140 };
9142 M->getVTList(), Ops, M->getMemoryVT(),
9143 M->getMemOperand());
9144 }
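  // Worked example of the offset packing above: ds_ordered_add with
  // ordered-count index 1, wave_release = 1, wave_done = 1 and a dword count
  // of 4 on a GFX11+ target (where the shader-type bits are no longer
  // inserted) gives
  //   Offset0 = 1 << 2                  = 0x04
  //   Offset1 = 1 | (1 << 1) | (0 << 4) = 0x03, then |= (4 - 1) << 6 -> 0xC3
  //   Offset  = Offset0 | (Offset1 << 8) = 0xC304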
9145 case Intrinsic::amdgcn_raw_buffer_load:
9146 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9147 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9148 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9149 case Intrinsic::amdgcn_raw_buffer_load_format:
9150 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9151 const bool IsFormat =
9152 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9153 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9154
9155 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9156 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9157 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9158 SDValue Ops[] = {
9159 Op.getOperand(0), // Chain
9160 Rsrc, // rsrc
9161 DAG.getConstant(0, DL, MVT::i32), // vindex
9162 VOffset, // voffset
9163 SOffset, // soffset
9164 Offset, // offset
9165 Op.getOperand(5), // cachepolicy, swizzled buffer
9166 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9167 };
9168
9169 auto *M = cast<MemSDNode>(Op);
9170 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9171 }
9172 case Intrinsic::amdgcn_struct_buffer_load:
9173 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9174 case Intrinsic::amdgcn_struct_buffer_load_format:
9175 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9176 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9177 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9178 const bool IsFormat =
9179 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9180 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9181
9182 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9183 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9184 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9185 SDValue Ops[] = {
9186 Op.getOperand(0), // Chain
9187 Rsrc, // rsrc
9188 Op.getOperand(3), // vindex
9189 VOffset, // voffset
9190 SOffset, // soffset
9191 Offset, // offset
9192 Op.getOperand(6), // cachepolicy, swizzled buffer
9193 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9194 };
9195
9196 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9197 }
9198 case Intrinsic::amdgcn_raw_tbuffer_load:
9199 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9200 MemSDNode *M = cast<MemSDNode>(Op);
9201 EVT LoadVT = Op.getValueType();
9202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9204 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9205
9206 SDValue Ops[] = {
9207 Op.getOperand(0), // Chain
9208 Rsrc, // rsrc
9209 DAG.getConstant(0, DL, MVT::i32), // vindex
9210 VOffset, // voffset
9211 SOffset, // soffset
9212 Offset, // offset
9213 Op.getOperand(5), // format
9214 Op.getOperand(6), // cachepolicy, swizzled buffer
9215 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9216 };
9217
9218 if (LoadVT.getScalarType() == MVT::f16)
9219 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9220 Ops);
9221 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9222 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9223 DAG);
9224 }
9225 case Intrinsic::amdgcn_struct_tbuffer_load:
9226 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9227 MemSDNode *M = cast<MemSDNode>(Op);
9228 EVT LoadVT = Op.getValueType();
9229 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9230 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9231 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9232
9233 SDValue Ops[] = {
9234 Op.getOperand(0), // Chain
9235 Rsrc, // rsrc
9236 Op.getOperand(3), // vindex
9237 VOffset, // voffset
9238 SOffset, // soffset
9239 Offset, // offset
9240 Op.getOperand(6), // format
9241 Op.getOperand(7), // cachepolicy, swizzled buffer
9242 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9243 };
9244
9245 if (LoadVT.getScalarType() == MVT::f16)
9246 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9247 Ops);
9248 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9249 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9250 DAG);
9251 }
9252 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9253 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9254 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9255 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9257 return lowerStructBufferAtomicIntrin(Op, DAG,
9259 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9260 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9261 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9262 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9264 return lowerStructBufferAtomicIntrin(Op, DAG,
9266 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9267 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9268 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9269 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9271 return lowerStructBufferAtomicIntrin(Op, DAG,
9273 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9274 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9275 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9276 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9278 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9279 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9280 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9281 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9282 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9284 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9285 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9286 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9287 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9288 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9289 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9290 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9291 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9292 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9293 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9294 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9296 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9297 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9298 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9299 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9300 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9302 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9303 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9305 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9306 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9308 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9309 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9310 return lowerRawBufferAtomicIntrin(Op, DAG,
9312 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9313 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9314 return lowerStructBufferAtomicIntrin(Op, DAG,
9316 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9317 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9318 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9319 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9320 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9321 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9322 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9324 return lowerStructBufferAtomicIntrin(Op, DAG,
9326 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9328 return lowerStructBufferAtomicIntrin(Op, DAG,
9330 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9332 return lowerStructBufferAtomicIntrin(Op, DAG,
9334 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9336 return lowerStructBufferAtomicIntrin(Op, DAG,
9338 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9340 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9341 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9342 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9343 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9344 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9345 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9346 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9347 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9348 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9349 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9350 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9352 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9353 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9354 return lowerStructBufferAtomicIntrin(Op, DAG,
9356
9357 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9358 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9359 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9360 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9361 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9362 SDValue Ops[] = {
9363 Op.getOperand(0), // Chain
9364 Op.getOperand(2), // src
9365 Op.getOperand(3), // cmp
9366 Rsrc, // rsrc
9367 DAG.getConstant(0, DL, MVT::i32), // vindex
9368 VOffset, // voffset
9369 SOffset, // soffset
9370 Offset, // offset
9371 Op.getOperand(7), // cachepolicy
9372 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9373 };
9374 EVT VT = Op.getValueType();
9375 auto *M = cast<MemSDNode>(Op);
9376
9378 Op->getVTList(), Ops, VT,
9379 M->getMemOperand());
9380 }
9381 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9382 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9383 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9384 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9385 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9386 SDValue Ops[] = {
9387 Op.getOperand(0), // Chain
9388 Op.getOperand(2), // src
9389 Op.getOperand(3), // cmp
9390 Rsrc, // rsrc
9391 Op.getOperand(5), // vindex
9392 VOffset, // voffset
9393 SOffset, // soffset
9394 Offset, // offset
9395 Op.getOperand(8), // cachepolicy
9396 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9397 };
9398 EVT VT = Op.getValueType();
9399 auto *M = cast<MemSDNode>(Op);
9400
9402 Op->getVTList(), Ops, VT,
9403 M->getMemOperand());
9404 }
9405 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9406 MemSDNode *M = cast<MemSDNode>(Op);
9407 SDValue NodePtr = M->getOperand(2);
9408 SDValue RayExtent = M->getOperand(3);
9409 SDValue RayOrigin = M->getOperand(4);
9410 SDValue RayDir = M->getOperand(5);
9411 SDValue RayInvDir = M->getOperand(6);
9412 SDValue TDescr = M->getOperand(7);
9413
9414 assert(NodePtr.getValueType() == MVT::i32 ||
9415 NodePtr.getValueType() == MVT::i64);
9416 assert(RayDir.getValueType() == MVT::v3f16 ||
9417 RayDir.getValueType() == MVT::v3f32);
9418
9419 if (!Subtarget->hasGFX10_AEncoding()) {
9420 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9421 return SDValue();
9422 }
9423
9424 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9425 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9426 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9427 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9428 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9429 const unsigned NumVDataDwords = 4;
9430 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9431 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9432 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9433 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9434 IsGFX12Plus;
9435 const unsigned BaseOpcodes[2][2] = {
9436 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9437 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9438 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9439 int Opcode;
9440 if (UseNSA) {
9441 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9442 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9443 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9444 : AMDGPU::MIMGEncGfx10NSA,
9445 NumVDataDwords, NumVAddrDwords);
9446 } else {
9447 assert(!IsGFX12Plus);
9448 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9449 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9450 : AMDGPU::MIMGEncGfx10Default,
9451 NumVDataDwords, NumVAddrDwords);
9452 }
9453 assert(Opcode != -1);
9454
9456
9457 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9459 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9460 if (Lanes[0].getValueSizeInBits() == 32) {
9461 for (unsigned I = 0; I < 3; ++I)
9462 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9463 } else {
9464 if (IsAligned) {
9465 Ops.push_back(DAG.getBitcast(
9466 MVT::i32,
9467 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9468 Ops.push_back(Lanes[2]);
9469 } else {
9470 SDValue Elt0 = Ops.pop_back_val();
9471 Ops.push_back(DAG.getBitcast(
9472 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9473 Ops.push_back(DAG.getBitcast(
9474 MVT::i32,
9475 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9476 }
9477 }
9478 };
9479
9480 if (UseNSA && IsGFX11Plus) {
9481 Ops.push_back(NodePtr);
9482 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9483 Ops.push_back(RayOrigin);
9484 if (IsA16) {
9485 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9486 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9487 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9488 for (unsigned I = 0; I < 3; ++I) {
9489 MergedLanes.push_back(DAG.getBitcast(
9490 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9491 {DirLanes[I], InvDirLanes[I]})));
9492 }
9493 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9494 } else {
9495 Ops.push_back(RayDir);
9496 Ops.push_back(RayInvDir);
9497 }
9498 } else {
9499 if (Is64)
9500 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9501 2);
9502 else
9503 Ops.push_back(NodePtr);
9504
9505 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9506 packLanes(RayOrigin, true);
9507 packLanes(RayDir, true);
9508 packLanes(RayInvDir, false);
9509 }
9510
9511 if (!UseNSA) {
9512       // Build a single vector containing all the operands prepared so far.
9513 if (NumVAddrDwords > 12) {
9514 SDValue Undef = DAG.getUNDEF(MVT::i32);
9515 Ops.append(16 - Ops.size(), Undef);
9516 }
9517 assert(Ops.size() >= 8 && Ops.size() <= 12);
9518 SDValue MergedOps =
9519 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9520 Ops.clear();
9521 Ops.push_back(MergedOps);
9522 }
9523
9524 Ops.push_back(TDescr);
9525 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9526 Ops.push_back(M->getChain());
9527
9528 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9529 MachineMemOperand *MemRef = M->getMemOperand();
9530 DAG.setNodeMemRefs(NewNode, {MemRef});
9531 return SDValue(NewNode, 0);
9532 }
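  // Note on the operand layout built above: with NSA encoding each prepared
  // address dword stays a separate operand, while without NSA they are all
  // merged into a single build_vector that becomes the one vaddr operand; in
  // both cases the texture descriptor, the A16 flag and the chain are
  // appended last.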
9533 case Intrinsic::amdgcn_global_atomic_fmin_num:
9534 case Intrinsic::amdgcn_global_atomic_fmax_num:
9535 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9536 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9537 MemSDNode *M = cast<MemSDNode>(Op);
9538 SDValue Ops[] = {
9539 M->getOperand(0), // Chain
9540 M->getOperand(2), // Ptr
9541 M->getOperand(3) // Value
9542 };
9543 unsigned Opcode = 0;
9544 switch (IntrID) {
9545 case Intrinsic::amdgcn_global_atomic_fmin_num:
9546 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9547 Opcode = ISD::ATOMIC_LOAD_FMIN;
9548 break;
9549 }
9550 case Intrinsic::amdgcn_global_atomic_fmax_num:
9551 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9552 Opcode = ISD::ATOMIC_LOAD_FMAX;
9553 break;
9554 }
9555 default:
9556 llvm_unreachable("unhandled atomic opcode");
9557 }
9558 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9559 Ops, M->getMemOperand());
9560 }
9561 case Intrinsic::amdgcn_s_get_barrier_state:
9562 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9563 SDValue Chain = Op->getOperand(0);
9565 unsigned Opc;
9566
9567 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9568 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9569 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9570 BarID = (BarID >> 4) & 0x3F;
9571 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9572 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9573 Ops.push_back(K);
9574 Ops.push_back(Chain);
9575 } else {
9576 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9577 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9578 SDValue M0Val;
9579 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9580 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9581 M0Val = SDValue(
9582 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9583 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9584 0);
9585 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9586 } else
9587 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9588 }
9589
9590 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9591 return SDValue(NewMI, 0);
9592 }
9593 default:
9594
9595 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9597 return lowerImage(Op, ImageDimIntr, DAG, true);
9598
9599 return SDValue();
9600 }
9601}
9602
9603 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9604 // dwordx4 on subtargets without dwordx3 load/stores, and handle TFE loads.
9605SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9606 SDVTList VTList,
9607 ArrayRef<SDValue> Ops, EVT MemVT,
9608 MachineMemOperand *MMO,
9609 SelectionDAG &DAG) const {
9610 LLVMContext &C = *DAG.getContext();
9612 EVT VT = VTList.VTs[0];
9613
9614 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9615 bool IsTFE = VTList.NumVTs == 3;
9616 if (IsTFE) {
9617 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9618 unsigned NumOpDWords = NumValueDWords + 1;
9619 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9620 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9621 MachineMemOperand *OpDWordsMMO =
9622 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9623 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9624 OpDWordsVT, OpDWordsMMO, DAG);
9626 DAG.getVectorIdxConstant(NumValueDWords, DL));
9627 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9628 SDValue ValueDWords =
9629 NumValueDWords == 1
9630 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9632 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9633 ZeroIdx);
9634 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9635 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9636 }
9637
9638 if (!Subtarget->hasDwordx3LoadStores() &&
9639 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9640 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9641 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9642 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9643 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9644 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9645 WidenedMemVT, WidenedMMO);
9647 DAG.getVectorIdxConstant(0, DL));
9648 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9649 }
9650
9651 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9652}
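// Example of the TFE path above: a v3f32 buffer load with TFE is widened to a
// v4i32 memory node (three value dwords plus one status dword); the status is
// then taken from element 3, the value dwords are re-extracted as a subvector
// starting at element 0 and bitcast back to v3f32, and {value, status, chain}
// are merged as the result.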
9653
9654SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9655 bool ImageStore) const {
9656 EVT StoreVT = VData.getValueType();
9657
9658 // No change for f16 and legal vector D16 types.
9659 if (!StoreVT.isVector())
9660 return VData;
9661
9662 SDLoc DL(VData);
9663 unsigned NumElements = StoreVT.getVectorNumElements();
9664
9665 if (Subtarget->hasUnpackedD16VMem()) {
9666 // We need to unpack the packed data to store.
9667 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9668 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9669
9670 EVT EquivStoreVT =
9671 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9672 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9673 return DAG.UnrollVectorOp(ZExt.getNode());
9674 }
9675
9676 // The sq block of gfx8.1 does not estimate register use correctly for d16
9677 // image store instructions. The data operand is computed as if it were not a
9678 // d16 image instruction.
9679 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9680 // Bitcast to i16
9681 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9682 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9683
9684 // Decompose into scalars
9686 DAG.ExtractVectorElements(IntVData, Elts);
9687
9688 // Group pairs of i16 into v2i16 and bitcast to i32
9689 SmallVector<SDValue, 4> PackedElts;
9690 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9691 SDValue Pair =
9692 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9693 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9694 PackedElts.push_back(IntPair);
9695 }
9696 if ((NumElements % 2) == 1) {
9697 // Handle v3i16
9698 unsigned I = Elts.size() / 2;
9699 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9700 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9701 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9702 PackedElts.push_back(IntPair);
9703 }
9704
9705 // Pad using UNDEF
9706 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9707
9708 // Build final vector
9709 EVT VecVT =
9710 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9711 return DAG.getBuildVector(VecVT, DL, PackedElts);
9712 }
9713
9714 if (NumElements == 3) {
9715 EVT IntStoreVT =
9717 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9718
9719 EVT WidenedStoreVT = EVT::getVectorVT(
9720 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9721 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9722 WidenedStoreVT.getStoreSizeInBits());
9723 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9724 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9725 }
9726
9727 assert(isTypeLegal(StoreVT));
9728 return VData;
9729}
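// Example of the paths above for a v4f16 store value: with unpacked D16
// memory instructions it is bitcast to v4i16, zero-extended to v4i32 and
// unrolled so each half occupies the low 16 bits of its own dword; on the
// gfx8.1 image-store workaround path it is instead repacked into two i32s
// (two halves per dword) and padded with undef dwords up to the original
// element count.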
9730
9731SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9732 SelectionDAG &DAG) const {
9733 SDLoc DL(Op);
9734 SDValue Chain = Op.getOperand(0);
9735 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9737
9738 switch (IntrinsicID) {
9739 case Intrinsic::amdgcn_exp_compr: {
9740 if (!Subtarget->hasCompressedExport()) {
9741 DiagnosticInfoUnsupported BadIntrin(
9743 "intrinsic not supported on subtarget", DL.getDebugLoc());
9744 DAG.getContext()->diagnose(BadIntrin);
9745 }
9746 SDValue Src0 = Op.getOperand(4);
9747 SDValue Src1 = Op.getOperand(5);
9748 // Hack around illegal type on SI by directly selecting it.
9749 if (isTypeLegal(Src0.getValueType()))
9750 return SDValue();
9751
9752 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9753 SDValue Undef = DAG.getUNDEF(MVT::f32);
9754 const SDValue Ops[] = {
9755 Op.getOperand(2), // tgt
9756 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9757 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9758 Undef, // src2
9759 Undef, // src3
9760 Op.getOperand(7), // vm
9761 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9762 Op.getOperand(3), // en
9763 Op.getOperand(0) // Chain
9764 };
9765
9766 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9767 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9768 }
9769 case Intrinsic::amdgcn_s_barrier:
9770 case Intrinsic::amdgcn_s_barrier_signal:
9771 case Intrinsic::amdgcn_s_barrier_wait: {
9774 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9775 if (WGSize <= ST.getWavefrontSize()) {
9776 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9777 // s_barrier/s_barrier_wait to wave_barrier.
9778 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9779 return Op.getOperand(0);
9780 else
9781 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9782 MVT::Other, Op.getOperand(0)),
9783 0);
9784 }
9785 }
9786
9787 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9788 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9789 SDValue K =
9791 SDValue BarSignal =
9792 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9793 MVT::Other, K, Op.getOperand(0)),
9794 0);
9795 SDValue BarWait =
9796 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9797 BarSignal.getValue(0)),
9798 0);
9799 return BarWait;
9800 }
9801
9802 return SDValue();
9803   }
9804
9805 case Intrinsic::amdgcn_struct_tbuffer_store:
9806 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9807 SDValue VData = Op.getOperand(2);
9808 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9809 if (IsD16)
9810 VData = handleD16VData(VData, DAG);
9811 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9812 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9813 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9814 SDValue Ops[] = {
9815 Chain,
9816 VData, // vdata
9817 Rsrc, // rsrc
9818 Op.getOperand(4), // vindex
9819 VOffset, // voffset
9820 SOffset, // soffset
9821 Offset, // offset
9822 Op.getOperand(7), // format
9823 Op.getOperand(8), // cachepolicy, swizzled buffer
9824 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9825 };
9826 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9828 MemSDNode *M = cast<MemSDNode>(Op);
9829 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9830 M->getMemoryVT(), M->getMemOperand());
9831 }
9832
9833 case Intrinsic::amdgcn_raw_tbuffer_store:
9834 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9835 SDValue VData = Op.getOperand(2);
9836 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9837 if (IsD16)
9838 VData = handleD16VData(VData, DAG);
9839 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9840 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9841 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9842 SDValue Ops[] = {
9843 Chain,
9844 VData, // vdata
9845 Rsrc, // rsrc
9846 DAG.getConstant(0, DL, MVT::i32), // vindex
9847 VOffset, // voffset
9848 SOffset, // soffset
9849 Offset, // offset
9850 Op.getOperand(6), // format
9851 Op.getOperand(7), // cachepolicy, swizzled buffer
9852 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9853 };
9854 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9856 MemSDNode *M = cast<MemSDNode>(Op);
9857 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9858 M->getMemoryVT(), M->getMemOperand());
9859 }
9860
9861 case Intrinsic::amdgcn_raw_buffer_store:
9862 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9863 case Intrinsic::amdgcn_raw_buffer_store_format:
9864 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9865 const bool IsFormat =
9866 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9867 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9868
9869 SDValue VData = Op.getOperand(2);
9870 EVT VDataVT = VData.getValueType();
9871 EVT EltType = VDataVT.getScalarType();
9872 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9873 if (IsD16) {
9874 VData = handleD16VData(VData, DAG);
9875 VDataVT = VData.getValueType();
9876 }
9877
9878 if (!isTypeLegal(VDataVT)) {
9879 VData =
9880 DAG.getNode(ISD::BITCAST, DL,
9881 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9882 }
9883
9884 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9885 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9886 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9887 SDValue Ops[] = {
9888 Chain,
9889 VData,
9890 Rsrc,
9891 DAG.getConstant(0, DL, MVT::i32), // vindex
9892 VOffset, // voffset
9893 SOffset, // soffset
9894 Offset, // offset
9895 Op.getOperand(6), // cachepolicy, swizzled buffer
9896 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9897 };
9898 unsigned Opc =
9900 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9901 MemSDNode *M = cast<MemSDNode>(Op);
9902
9903 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9904 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9905 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9906
9907 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9908 M->getMemoryVT(), M->getMemOperand());
9909 }
9910
9911 case Intrinsic::amdgcn_struct_buffer_store:
9912 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9913 case Intrinsic::amdgcn_struct_buffer_store_format:
9914 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9915 const bool IsFormat =
9916 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9917 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9918
9919 SDValue VData = Op.getOperand(2);
9920 EVT VDataVT = VData.getValueType();
9921 EVT EltType = VDataVT.getScalarType();
9922 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9923
9924 if (IsD16) {
9925 VData = handleD16VData(VData, DAG);
9926 VDataVT = VData.getValueType();
9927 }
9928
9929 if (!isTypeLegal(VDataVT)) {
9930 VData =
9931 DAG.getNode(ISD::BITCAST, DL,
9932 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9933 }
9934
9935 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9936 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9937 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9938 SDValue Ops[] = {
9939 Chain,
9940 VData,
9941 Rsrc,
9942 Op.getOperand(4), // vindex
9943 VOffset, // voffset
9944 SOffset, // soffset
9945 Offset, // offset
9946 Op.getOperand(7), // cachepolicy, swizzled buffer
9947 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9948 };
9949 unsigned Opc =
9951 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9952 MemSDNode *M = cast<MemSDNode>(Op);
9953
9954 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9955 EVT VDataType = VData.getValueType().getScalarType();
9956 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9957 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9958
9959 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9960 M->getMemoryVT(), M->getMemOperand());
9961 }
9962 case Intrinsic::amdgcn_raw_buffer_load_lds:
9963 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9964 case Intrinsic::amdgcn_struct_buffer_load_lds:
9965 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9966 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9967 unsigned Opc;
9968 bool HasVIndex =
9969 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9970 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9971 unsigned OpOffset = HasVIndex ? 1 : 0;
9972 SDValue VOffset = Op.getOperand(5 + OpOffset);
9973 bool HasVOffset = !isNullConstant(VOffset);
9974 unsigned Size = Op->getConstantOperandVal(4);
9975
9976 switch (Size) {
9977 default:
9978 return SDValue();
9979 case 1:
9980 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9981 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9982 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9983 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9984 break;
9985 case 2:
9986 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9987 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9988 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9989 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9990 break;
9991 case 4:
9992 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9993 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9994 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9995 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9996 break;
9997 case 12:
9998 if (!Subtarget->hasLDSLoadB96_B128())
9999 return SDValue();
10000 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10001 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10002 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10003 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10004 break;
10005 case 16:
10006 if (!Subtarget->hasLDSLoadB96_B128())
10007 return SDValue();
10008 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10009 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10010 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10011 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10012 break;
10013 }
10014
10015 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10016
10018
10019 if (HasVIndex && HasVOffset)
10020 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10021 {Op.getOperand(5), // VIndex
10022 VOffset}));
10023 else if (HasVIndex)
10024 Ops.push_back(Op.getOperand(5));
10025 else if (HasVOffset)
10026 Ops.push_back(VOffset);
10027
10028 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10029 Ops.push_back(Rsrc);
10030 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10031 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10032 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10033 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10035 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10036 DL, MVT::i8)); // cpol
10038 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10039 ? 1
10040 : 0,
10041 DL, MVT::i8)); // swz
10042 Ops.push_back(M0Val.getValue(0)); // Chain
10043 Ops.push_back(M0Val.getValue(1)); // Glue
10044
10045 auto *M = cast<MemSDNode>(Op);
10046 MachineMemOperand *LoadMMO = M->getMemOperand();
10047 // Don't set the offset value here because the pointer points to the base of
10048 // the buffer.
10049 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10050
10051 MachinePointerInfo StorePtrI = LoadPtrI;
10052 LoadPtrI.V = PoisonValue::get(
10056
10057 auto F = LoadMMO->getFlags() &
10059 LoadMMO =
10061 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10062
10064 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10065 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10066
10067 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10068 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10069
10070 return SDValue(Load, 0);
10071 }
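  // Note on the memory operands above: the buffer-to-LDS transfer is a single
  // machine instruction, but it is given two memory operands, a load from the
  // buffer (whose pointer is left at the resource base, hence no offset) and
  // a store to the LDS destination, so alias analysis and the scheduler see
  // both sides of the copy.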
10072 case Intrinsic::amdgcn_global_load_lds: {
10073 unsigned Opc;
10074 unsigned Size = Op->getConstantOperandVal(4);
10075 switch (Size) {
10076 default:
10077 return SDValue();
10078 case 1:
10079 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10080 break;
10081 case 2:
10082 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10083 break;
10084 case 4:
10085 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10086 break;
10087 case 12:
10088 if (!Subtarget->hasLDSLoadB96_B128())
10089 return SDValue();
10090 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10091 break;
10092 case 16:
10093 if (!Subtarget->hasLDSLoadB96_B128())
10094 return SDValue();
10095 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10096 break;
10097 }
10098
10099 auto *M = cast<MemSDNode>(Op);
10100 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10101
10103
10104 SDValue Addr = Op.getOperand(2); // Global ptr
10105 SDValue VOffset;
10106 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10107 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10108 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10109 SDValue LHS = Addr.getOperand(0);
10110 SDValue RHS = Addr.getOperand(1);
10111
10112 if (LHS->isDivergent())
10113 std::swap(LHS, RHS);
10114
10115 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10116 RHS.getOperand(0).getValueType() == MVT::i32) {
10117 // add (i64 sgpr), (zero_extend (i32 vgpr))
10118 Addr = LHS;
10119 VOffset = RHS.getOperand(0);
10120 }
10121 }
10122
10123 Ops.push_back(Addr);
10124 if (!Addr->isDivergent()) {
10125 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10126 if (!VOffset)
10127 VOffset =
10128 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10129 DAG.getTargetConstant(0, DL, MVT::i32)),
10130 0);
10131 Ops.push_back(VOffset);
10132 }
10133
10134 Ops.push_back(Op.getOperand(5)); // Offset
10135 Ops.push_back(Op.getOperand(6)); // CPol
10136 Ops.push_back(M0Val.getValue(0)); // Chain
10137 Ops.push_back(M0Val.getValue(1)); // Glue
10138
10139 MachineMemOperand *LoadMMO = M->getMemOperand();
10140 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10141 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10142 MachinePointerInfo StorePtrI = LoadPtrI;
10143 LoadPtrI.V = PoisonValue::get(
10144 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10145 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10146 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10147 auto F = LoadMMO->getFlags() &
10148 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10149 LoadMMO =
10150 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10151 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10152 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10153 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10154 LoadMMO->getAAInfo());
10155
10156 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10157 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10158
10159 return SDValue(Load, 0);
10160 }
10161 case Intrinsic::amdgcn_end_cf:
10162 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10163 Op->getOperand(2), Chain),
10164 0);
10165 case Intrinsic::amdgcn_s_barrier_init:
10166 case Intrinsic::amdgcn_s_barrier_signal_var: {
10167 // these two intrinsics have two operands: barrier pointer and member count
10168 SDValue Chain = Op->getOperand(0);
10169 SmallVector<SDValue, 2> Ops;
10170 SDValue BarOp = Op->getOperand(2);
10171 SDValue CntOp = Op->getOperand(3);
10172 SDValue M0Val;
10173 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10174 ? AMDGPU::S_BARRIER_INIT_M0
10175 : AMDGPU::S_BARRIER_SIGNAL_M0;
10176 // extract the BarrierID from bits 4-9 of BarOp
10177 SDValue BarID;
10178 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10179 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10180 BarID =
10181 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10182 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10183 0);
10184 // Member count should be put into M0[ShAmt:+6]
10185 // Barrier ID should be put into M0[5:0]
10186 M0Val =
10187 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10188 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10189 0);
10190 constexpr unsigned ShAmt = 16;
10191 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10192 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10193
10194 M0Val = SDValue(
10195 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10196
10197 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10198
10199 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10200 return SDValue(NewMI, 0);
10201 }
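// Worked example of the M0 packing above (illustrative only, hypothetical
// operand values): for a barrier pointer operand of 0x50 (barrier ID 5 in
// bits 9:4) and a member count of 8, the sequence computes
//   BarID = (0x50 >> 4) & 0x3F = 5
//   M0    = (8 << 16) | 5      = 0x00080005
// so the barrier ID ends up in M0[5:0] and the member count in M0[21:16]
// before S_BARRIER_INIT_M0 / S_BARRIER_SIGNAL_M0 reads M0.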
10202 case Intrinsic::amdgcn_s_barrier_join: {
10203 // This intrinsic has one operand: the barrier pointer.
10204 SDValue Chain = Op->getOperand(0);
10205 SmallVector<SDValue, 2> Ops;
10206 SDValue BarOp = Op->getOperand(2);
10207 unsigned Opc;
10208
10209 if (isa<ConstantSDNode>(BarOp)) {
10210 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10211 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10212
10213 // extract the BarrierID from bits 4-9 of the immediate
10214 unsigned BarID = (BarVal >> 4) & 0x3F;
10215 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10216 Ops.push_back(K);
10217 Ops.push_back(Chain);
10218 } else {
10219 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10220
10221 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10222 SDValue M0Val;
10223 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10224 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10225 M0Val =
10226 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10227 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10228 0);
10229 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10230 }
10231
10232 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10233 return SDValue(NewMI, 0);
10234 }
10235 case Intrinsic::amdgcn_s_prefetch_data: {
10236 // For non-global address space preserve the chain and remove the call.
10237 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10238 return Op.getOperand(0);
10239 return Op;
10240 }
10241 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10242 SDValue Ops[] = {
10243 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10244 Op.getOperand(3), // offset
10245 Op.getOperand(4), // length
10246 };
10247
10248 MemSDNode *M = cast<MemSDNode>(Op);
10249 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10250 Op->getVTList(), Ops, M->getMemoryVT(),
10251 M->getMemOperand());
10252 }
10253 default: {
10254 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10255 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10256 return lowerImage(Op, ImageDimIntr, DAG, true);
10257
10258 return Op;
10259 }
10260 }
10261}
10262
10263// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10264// offset (the offset that is included in bounds checking and swizzling, to be
10265// split between the instruction's voffset and immoffset fields) and soffset
10266// (the offset that is excluded from bounds checking and swizzling, to go in
10267// the instruction's soffset field). This function takes the first kind of
10268// offset and figures out how to split it between voffset and immoffset.
10269std::pair<SDValue, SDValue>
10270SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10271 SDLoc DL(Offset);
10272 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10273 SDValue N0 = Offset;
10274 ConstantSDNode *C1 = nullptr;
10275
10276 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10277 N0 = SDValue();
10278 else if (DAG.isBaseWithConstantOffset(N0)) {
10279 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10280 N0 = N0.getOperand(0);
10281 }
10282
10283 if (C1) {
10284 unsigned ImmOffset = C1->getZExtValue();
10285 // If the immediate value is too big for the immoffset field, put only bits
10286 // that would normally fit in the immoffset field. The remaining value that
10287 // is copied/added for the voffset field is a large power of 2, and it
10288 // stands more chance of being CSEd with the copy/add for another similar
10289 // load/store.
10290 // However, do not do that rounding down if the resulting overflow value is
10291 // negative, as it appears to be illegal to have a negative offset in the
10292 // VGPR, even if adding the immediate offset makes it positive.
10293 unsigned Overflow = ImmOffset & ~MaxImm;
10294 ImmOffset -= Overflow;
10295 if ((int32_t)Overflow < 0) {
10296 Overflow += ImmOffset;
10297 ImmOffset = 0;
10298 }
10299 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10300 if (Overflow) {
10301 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10302 if (!N0)
10303 N0 = OverflowVal;
10304 else {
10305 SDValue Ops[] = {N0, OverflowVal};
10306 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10307 }
10308 }
10309 }
10310 if (!N0)
10311 N0 = DAG.getConstant(0, DL, MVT::i32);
10312 if (!C1)
10313 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10314 return {N0, SDValue(C1, 0)};
10315}
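// Illustrative example of the split above (hypothetical values, assuming a
// 12-bit immediate field, i.e. MaxImm == 4095): a combined offset of 4200
// becomes
//   Overflow  = 4200 & ~4095 = 4096   // added through the voffset operand
//   ImmOffset = 4200 - 4096  = 104    // fits in the instruction's immoffset
// so the returned pair is {add(N0, 4096), 104}. Keeping the voffset part a
// large power of two makes that add more likely to CSE with the adds emitted
// for neighbouring buffer accesses.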
10316
10317// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10318// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10319// pointed to by Offsets.
10320void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10321 SelectionDAG &DAG, SDValue *Offsets,
10322 Align Alignment) const {
10323 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10324 SDLoc DL(CombinedOffset);
10325 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10326 uint32_t Imm = C->getZExtValue();
10327 uint32_t SOffset, ImmOffset;
10328 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10329 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10330 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10331 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10332 return;
10333 }
10334 }
10335 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10336 SDValue N0 = CombinedOffset.getOperand(0);
10337 SDValue N1 = CombinedOffset.getOperand(1);
10338 uint32_t SOffset, ImmOffset;
10339 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10340 if (Offset >= 0 &&
10341 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10342 Offsets[0] = N0;
10343 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10344 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10345 return;
10346 }
10347 }
10348
10349 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10350 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10351 : DAG.getConstant(0, DL, MVT::i32);
10352
10353 Offsets[0] = CombinedOffset;
10354 Offsets[1] = SOffsetZero;
10355 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10356}
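// Illustrative examples of the decomposition above (hypothetical operands):
//   - a plain constant 100 accepted by splitMUBUFOffset becomes
//       {voffset = 0, soffset = 0, instoffset = 100}
//   - (add %base, 60) with a small non-negative constant becomes
//       {voffset = %base, soffset = 0, instoffset = 60}
//   - anything else falls through to
//       {voffset = CombinedOffset, soffset = 0 (or SGPR_NULL), instoffset = 0}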
10357
10358SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10359 SelectionDAG &DAG) const {
10360 if (!MaybePointer.getValueType().isScalarInteger())
10361 return MaybePointer;
10362
10363 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10364 return Rsrc;
10365}
10366
10367// Wrap a global or flat pointer into a buffer intrinsic using the flags
10368// specified in the intrinsic.
10369SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10370 SelectionDAG &DAG) const {
10371 SDLoc Loc(Op);
10372
10373 SDValue Pointer = Op->getOperand(1);
10374 SDValue Stride = Op->getOperand(2);
10375 SDValue NumRecords = Op->getOperand(3);
10376 SDValue Flags = Op->getOperand(4);
10377
10378 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10379 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10380 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10381 std::optional<uint32_t> ConstStride = std::nullopt;
10382 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10383 ConstStride = ConstNode->getZExtValue();
10384
10385 SDValue NewHighHalf = Masked;
10386 if (!ConstStride || *ConstStride != 0) {
10387 SDValue ShiftedStride;
10388 if (ConstStride) {
10389 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10390 } else {
10391 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10392 ShiftedStride =
10393 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10394 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10395 }
10396 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10397 }
10398
10399 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10400 NewHighHalf, NumRecords, Flags);
10401 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10402 return RsrcPtr;
10403}
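// Illustrative sketch of the descriptor word built above (hypothetical
// values): for a pointer whose high 32 bits are 0x00001234 and a stride of
// 16, the second dword becomes
//   (0x00001234 & 0xffff) | (16 << 16) = 0x00101234
// i.e. the pointer's upper address bits stay in [15:0] and the stride
// occupies [29:16]; NumRecords and Flags fill the remaining two dwords of the
// v4i32 resource.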
10404
10405// Handle 8 bit and 16 bit buffer loads
10406SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10407 EVT LoadVT, SDLoc DL,
10408 ArrayRef<SDValue> Ops,
10409 MachineMemOperand *MMO,
10410 bool IsTFE) const {
10411 EVT IntVT = LoadVT.changeTypeToInteger();
10412
10413 if (IsTFE) {
10414 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10415 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10416 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10417 MachineFunction &MF = DAG.getMachineFunction();
10418 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10419 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10420 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10421 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10422 DAG.getConstant(1, DL, MVT::i32));
10423 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10424 DAG.getConstant(0, DL, MVT::i32));
10425 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10426 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10427 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10428 }
10429
10430 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10431 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10432 : AMDGPUISD::BUFFER_LOAD_USHORT;
10433
10434 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10435 SDValue BufferLoad =
10436 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10437 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10438 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10439
10440 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10441}
10442
10443// Handle 8 bit and 16 bit buffer stores
10444SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10445 EVT VDataType, SDLoc DL,
10446 SDValue Ops[],
10447 MemSDNode *M) const {
10448 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10449 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10450
10451 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10452 Ops[1] = BufferStoreExt;
10453 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10454 : AMDGPUISD::BUFFER_STORE_SHORT;
10455 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10456 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10457 M->getMemOperand());
10458}
10459
10460 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10461 SDValue Op, const SDLoc &SL, EVT VT) {
10462 if (VT.bitsLT(Op.getValueType()))
10463 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10464
10465 switch (ExtType) {
10466 case ISD::SEXTLOAD:
10467 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10468 case ISD::ZEXTLOAD:
10469 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10470 case ISD::EXTLOAD:
10471 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10472 case ISD::NON_EXTLOAD:
10473 return Op;
10474 }
10475
10476 llvm_unreachable("invalid ext type");
10477}
10478
10479// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10480// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10481SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10482 DAGCombinerInfo &DCI) const {
10483 SelectionDAG &DAG = DCI.DAG;
10484 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10485 return SDValue();
10486
10487 // FIXME: Constant loads should all be marked invariant.
10488 unsigned AS = Ld->getAddressSpace();
10489 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10490 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10491 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10492 return SDValue();
10493
10494 // Don't do this early, since it may interfere with adjacent load merging for
10495 // illegal types. We can avoid losing alignment information for exotic types
10496 // pre-legalize.
10497 EVT MemVT = Ld->getMemoryVT();
10498 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10499 MemVT.getSizeInBits() >= 32)
10500 return SDValue();
10501
10502 SDLoc SL(Ld);
10503
10504 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10505 "unexpected vector extload");
10506
10507 // TODO: Drop only high part of range.
10508 SDValue Ptr = Ld->getBasePtr();
10509 SDValue NewLoad = DAG.getLoad(
10510 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10511 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10512 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10513 nullptr); // Drop ranges
10514
10515 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10516 if (MemVT.isFloatingPoint()) {
10517 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10518 "unexpected fp extload");
10519 TruncVT = MemVT.changeTypeToInteger();
10520 }
10521
10522 SDValue Cvt = NewLoad;
10523 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10524 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10525 DAG.getValueType(TruncVT));
10526 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10527 Ld->getExtensionType() == ISD::EXTLOAD) {
10528 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10529 } else {
10530 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
10531 }
10532
10533 EVT VT = Ld->getValueType(0);
10534 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10535
10536 DCI.AddToWorklist(Cvt.getNode());
10537
10538 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10539 // the appropriate extension from the 32-bit load.
10540 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10541 DCI.AddToWorklist(Cvt.getNode());
10542
10543 // Handle conversion back to floating point if necessary.
10544 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10545
10546 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10547}
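// Illustrative example (hypothetical IR, not taken from a test): a uniform,
// naturally aligned sub-dword constant load such as
//   %v = load i8, ptr addrspace(4) %p, align 4
// is rewritten here as an i32 load of the same address; the loaded bits are
// then masked or extended according to the original extension type and
// converted back to the original value type, which lets the access select to
// an SMEM dword load.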
10548
10549 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10550 const SIMachineFunctionInfo &Info) {
10551 // TODO: Should check if the address can definitely not access stack.
10552 if (Info.isEntryFunction())
10553 return Info.getUserSGPRInfo().hasFlatScratchInit();
10554 return true;
10555}
10556
10557SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10558 SDLoc DL(Op);
10559 LoadSDNode *Load = cast<LoadSDNode>(Op);
10560 ISD::LoadExtType ExtType = Load->getExtensionType();
10561 EVT MemVT = Load->getMemoryVT();
10562 MachineMemOperand *MMO = Load->getMemOperand();
10563
10564 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10565 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10566 return SDValue();
10567
10568 // FIXME: Copied from PPC
10569 // First, load into 32 bits, then truncate to 1 bit.
10570
10571 SDValue Chain = Load->getChain();
10572 SDValue BasePtr = Load->getBasePtr();
10573
10574 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10575
10576 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10577 RealMemVT, MMO);
10578
10579 if (!MemVT.isVector()) {
10580 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10581 NewLD.getValue(1)};
10582
10583 return DAG.getMergeValues(Ops, DL);
10584 }
10585
10586 SmallVector<SDValue, 3> Elts;
10587 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10588 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10589 DAG.getConstant(I, DL, MVT::i32));
10590
10591 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10592 }
10593
10594 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10595
10596 return DAG.getMergeValues(Ops, DL);
10597 }
10598
10599 if (!MemVT.isVector())
10600 return SDValue();
10601
10602 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10603 "Custom lowering for non-i32 vectors hasn't been implemented.");
10604
10605 Align Alignment = Load->getAlign();
10606 unsigned AS = Load->getAddressSpace();
10607 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10608 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10609 return SplitVectorLoad(Op, DAG);
10610 }
10611
10612 MachineFunction &MF = DAG.getMachineFunction();
10613 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10614 // If there is a possibility that a flat instruction may access scratch
10615 // memory, then we need to use the same legalization rules we use for private.
10616 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10617 !Subtarget->hasMultiDwordFlatScratchAddressing())
10618 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10619 ? AMDGPUAS::PRIVATE_ADDRESS
10620 : AMDGPUAS::GLOBAL_ADDRESS;
10621
10622 unsigned NumElements = MemVT.getVectorNumElements();
10623
10624 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10625 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10626 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10627 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10628 isMemOpHasNoClobberedMemOperand(Load))) {
10629 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10630 Alignment >= Align(4) && NumElements < 32) {
10631 if (MemVT.isPow2VectorType() ||
10632 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10633 return SDValue();
10634 return WidenOrSplitVectorLoad(Op, DAG);
10635 }
10636 // Non-uniform loads will be selected to MUBUF instructions, so they
10637 // have the same legalization requirements as global and private
10638 // loads.
10639 //
10640 }
10641 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10642 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10643 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10644 if (NumElements > 4)
10645 return SplitVectorLoad(Op, DAG);
10646 // v3 loads not supported on SI.
10647 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10648 return WidenOrSplitVectorLoad(Op, DAG);
10649
10650 // v3 and v4 loads are supported for private and global memory.
10651 return SDValue();
10652 }
10653 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10654 // Depending on the setting of the private_element_size field in the
10655 // resource descriptor, we can only make private accesses up to a certain
10656 // size.
10657 switch (Subtarget->getMaxPrivateElementSize()) {
10658 case 4: {
10659 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10660 return DAG.getMergeValues({Op0, Op1}, DL);
10661 }
10662 case 8:
10663 if (NumElements > 2)
10664 return SplitVectorLoad(Op, DAG);
10665 return SDValue();
10666 case 16:
10667 // Same as global/flat
10668 if (NumElements > 4)
10669 return SplitVectorLoad(Op, DAG);
10670 // v3 loads not supported on SI.
10671 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10672 return WidenOrSplitVectorLoad(Op, DAG);
10673
10674 return SDValue();
10675 default:
10676 llvm_unreachable("unsupported private_element_size");
10677 }
10678 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10679 unsigned Fast = 0;
10680 auto Flags = Load->getMemOperand()->getFlags();
10681 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10682 Load->getAlign(), Flags, &Fast) &&
10683 Fast > 1)
10684 return SDValue();
10685
10686 if (MemVT.isVector())
10687 return SplitVectorLoad(Op, DAG);
10688 }
10689
10690 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10691 MemVT, *Load->getMemOperand())) {
10692 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10693 return DAG.getMergeValues({Op0, Op1}, DL);
10694 }
10695
10696 return SDValue();
10697}
10698
10699SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10700 EVT VT = Op.getValueType();
10701 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10702 VT.getSizeInBits() == 512)
10703 return splitTernaryVectorOp(Op, DAG);
10704
10705 assert(VT.getSizeInBits() == 64);
10706
10707 SDLoc DL(Op);
10708 SDValue Cond = Op.getOperand(0);
10709
10710 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10711 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10712
10713 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10714 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10715
10716 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10717 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10718
10719 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10720
10721 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10722 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10723
10724 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10725
10726 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10727 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10728}
10729
10730// Catch division cases where we can use shortcuts with rcp and rsq
10731// instructions.
10732SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10733 SelectionDAG &DAG) const {
10734 SDLoc SL(Op);
10735 SDValue LHS = Op.getOperand(0);
10736 SDValue RHS = Op.getOperand(1);
10737 EVT VT = Op.getValueType();
10738 const SDNodeFlags Flags = Op->getFlags();
10739
10740 bool AllowInaccurateRcp =
10741 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10742
10743 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10744 // Without !fpmath accuracy information, we can't do more because we don't
10745 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10746 // f16 is always accurate enough
10747 if (!AllowInaccurateRcp && VT != MVT::f16)
10748 return SDValue();
10749
10750 if (CLHS->isExactlyValue(1.0)) {
10751 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10752 // the CI documentation they have a worst-case error of 1 ulp.
10753 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10754 // use it as long as we aren't trying to use denormals.
10755 //
10756 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10757
10758 // 1.0 / sqrt(x) -> rsq(x)
10759
10760 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10761 // error seems really high at 2^29 ULP.
10762 // 1.0 / x -> rcp(x)
10763 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10764 }
10765
10766 // Same as for 1.0, but expand the sign out of the constant.
10767 if (CLHS->isExactlyValue(-1.0)) {
10768 // -1.0 / x -> rcp (fneg x)
10769 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10770 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10771 }
10772 }
10773
10774 // For f16 require afn or arcp.
10775 // For f32 require afn.
10776 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10777 return SDValue();
10778
10779 // Turn into multiply by the reciprocal.
10780 // x / y -> x * (1.0 / y)
10781 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10782 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10783}
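// Minimal sketch of the shortcuts above (they only apply when the afn/arcp or
// unsafe-math conditions hold):
//   1.0  / x  --> rcp(x)
//   -1.0 / x  --> rcp(fneg x)
//   x    / y  --> fmul(x, rcp(y))
// The last form trades a correctly rounded division for one reciprocal
// approximation plus a multiply.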
10784
10785SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10786 SelectionDAG &DAG) const {
10787 SDLoc SL(Op);
10788 SDValue X = Op.getOperand(0);
10789 SDValue Y = Op.getOperand(1);
10790 EVT VT = Op.getValueType();
10791 const SDNodeFlags Flags = Op->getFlags();
10792
10793 bool AllowInaccurateDiv =
10794 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10795 if (!AllowInaccurateDiv)
10796 return SDValue();
10797
10798 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10799 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10800
10801 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10802 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10803
10804 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10805 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10806 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10807 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10808 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10809 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10810}
10811
10812static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10813 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10814 SDNodeFlags Flags) {
10815 if (GlueChain->getNumValues() <= 1) {
10816 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10817 }
10818
10819 assert(GlueChain->getNumValues() == 3);
10820
10821 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10822 switch (Opcode) {
10823 default:
10824 llvm_unreachable("no chain equivalent for opcode");
10825 case ISD::FMUL:
10826 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10827 break;
10828 }
10829
10830 return DAG.getNode(Opcode, SL, VTList,
10831 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10832 Flags);
10833}
10834
10835static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10836 EVT VT, SDValue A, SDValue B, SDValue C,
10837 SDValue GlueChain, SDNodeFlags Flags) {
10838 if (GlueChain->getNumValues() <= 1) {
10839 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10840 }
10841
10842 assert(GlueChain->getNumValues() == 3);
10843
10844 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10845 switch (Opcode) {
10846 default:
10847 llvm_unreachable("no chain equivalent for opcode");
10848 case ISD::FMA:
10849 Opcode = AMDGPUISD::FMA_W_CHAIN;
10850 break;
10851 }
10852
10853 return DAG.getNode(Opcode, SL, VTList,
10854 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10855 Flags);
10856}
10857
10858SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10859 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10860 return FastLowered;
10861
10862 SDLoc SL(Op);
10863 SDValue LHS = Op.getOperand(0);
10864 SDValue RHS = Op.getOperand(1);
10865
10866 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10867 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10868 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10869 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10870 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10871 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10872 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10873 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10874 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10875 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10876 // q16.u = opx(V_CVT_F16_F32, q32.u);
10877 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10878
10879 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10880 unsigned FMADOpCode =
10881 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10882
10883 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10884 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10885 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10886 SDValue Rcp =
10887 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10888 SDValue Quot =
10889 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10890 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10891 Op->getFlags());
10892 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10893 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10894 Op->getFlags());
10895 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10896 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10897 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10898 DAG.getConstant(0xff800000, SL, MVT::i32));
10899 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10900 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10901 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10902 DAG.getTargetConstant(0, SL, MVT::i32));
10903 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10904 Op->getFlags());
10905}
10906
10907// Faster 2.5 ULP division that does not support denormals.
10908SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10909 SDNodeFlags Flags = Op->getFlags();
10910 SDLoc SL(Op);
10911 SDValue LHS = Op.getOperand(1);
10912 SDValue RHS = Op.getOperand(2);
10913
10914 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10915
10916 const APFloat K0Val(0x1p+96f);
10917 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10918
10919 const APFloat K1Val(0x1p-32f);
10920 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10921
10922 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10923
10924 EVT SetCCVT =
10925 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10926
10927 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10928
10929 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10930
10931 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10932
10933 // rcp does not support denormals.
10934 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10935
10936 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10937
10938 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10939}
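// Illustrative walk-through of the scaling above (hypothetical operands):
// when |rhs| > 2^96 the select picks r3 = 2^-32, so the code computes
//   r0 = rcp(rhs * 2^-32)      // scaled denominator stays in rcp's range
//   q  = 2^-32 * (lhs * r0)    // the final fmul by r3 undoes the scaling
// and q approximates lhs/rhs to roughly the 2.5 ulp accuracy of v_rcp_f32;
// for ordinary operands r3 == 1.0 and the scaling is a no-op.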
10940
10941// Returns immediate value for setting the F32 denorm mode when using the
10942// S_DENORM_MODE instruction.
10943 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10944 const SIMachineFunctionInfo *Info,
10945 const GCNSubtarget *ST) {
10946 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10947 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10948 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10949 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10950}
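// Illustrative encoding (assuming the usual SIDefines values, e.g.
// FP_DENORM_FLUSH_NONE == 3 and FP_DENORM_FLUSH_IN_FLUSH_OUT == 0): enabling
// f32 denormals while the function's default f64/f16 mode also allows them
// yields Mode = 3 | (3 << 2) = 0xF, the 4-bit FP_DENORM field that
// S_DENORM_MODE writes.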
10951
10952SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10953 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10954 return FastLowered;
10955
10956 // The selection matcher assumes that anything with a chain selects to a
10957 // mayRaiseFPException machine instruction. Since we're introducing a chain
10958 // here, we need to explicitly report nofpexcept for the regular fdiv
10959 // lowering.
10960 SDNodeFlags Flags = Op->getFlags();
10961 Flags.setNoFPExcept(true);
10962
10963 SDLoc SL(Op);
10964 SDValue LHS = Op.getOperand(0);
10965 SDValue RHS = Op.getOperand(1);
10966
10967 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10968
10969 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10970
10971 SDValue DenominatorScaled =
10972 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10973 SDValue NumeratorScaled =
10974 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10975
10976 // Denominator is scaled to not be denormal, so using rcp is ok.
10977 SDValue ApproxRcp =
10978 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10979 SDValue NegDivScale0 =
10980 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10981
10982 using namespace AMDGPU::Hwreg;
10983 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10984 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10985
10986 const MachineFunction &MF = DAG.getMachineFunction();
10987 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10988 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10989
10990 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10991 const bool HasDynamicDenormals =
10992 (DenormMode.Input == DenormalMode::Dynamic) ||
10993 (DenormMode.Output == DenormalMode::Dynamic);
10994
10995 SDValue SavedDenormMode;
10996
10997 if (!PreservesDenormals) {
10998 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10999 // lowering. The chain dependence is insufficient, and we need glue. We do
11000 // not need the glue variants in a strictfp function.
11001
11002 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11003
11004 SDValue Glue = DAG.getEntryNode();
11005 if (HasDynamicDenormals) {
11006 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11007 DAG.getVTList(MVT::i32, MVT::Glue),
11008 {BitField, Glue});
11009 SavedDenormMode = SDValue(GetReg, 0);
11010
11011 Glue = DAG.getMergeValues(
11012 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11013 }
11014
11015 SDNode *EnableDenorm;
11016 if (Subtarget->hasDenormModeInst()) {
11017 const SDValue EnableDenormValue =
11018 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11019
11020 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11021 EnableDenormValue)
11022 .getNode();
11023 } else {
11024 const SDValue EnableDenormValue =
11025 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11026 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11027 {EnableDenormValue, BitField, Glue});
11028 }
11029
11030 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11031 SDValue(EnableDenorm, 1)};
11032
11033 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11034 }
11035
11036 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11037 ApproxRcp, One, NegDivScale0, Flags);
11038
11039 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11040 ApproxRcp, Fma0, Flags);
11041
11042 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11043 Fma1, Flags);
11044
11045 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11046 NumeratorScaled, Mul, Flags);
11047
11048 SDValue Fma3 =
11049 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11050
11051 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11052 NumeratorScaled, Fma3, Flags);
11053
11054 if (!PreservesDenormals) {
11055 SDNode *DisableDenorm;
11056 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11057 const SDValue DisableDenormValue = getSPDenormModeValue(
11058 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11059
11060 DisableDenorm =
11061 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
11062 DisableDenormValue, Fma4.getValue(2))
11063 .getNode();
11064 } else {
11065 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11066 const SDValue DisableDenormValue =
11067 HasDynamicDenormals
11068 ? SavedDenormMode
11069 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11070
11071 DisableDenorm = DAG.getMachineNode(
11072 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11073 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11074 }
11075
11076 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11077 SDValue(DisableDenorm, 0), DAG.getRoot());
11078 DAG.setRoot(OutputChain);
11079 }
11080
11081 SDValue Scale = NumeratorScaled.getValue(1);
11082 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11083 {Fma4, Fma1, Fma3, Scale}, Flags);
11084
11085 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11086}
11087
11088SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11089 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11090 return FastLowered;
11091
11092 SDLoc SL(Op);
11093 SDValue X = Op.getOperand(0);
11094 SDValue Y = Op.getOperand(1);
11095
11096 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11097
11098 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11099
11100 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11101
11102 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11103
11104 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11105
11106 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11107
11108 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11109
11110 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11111
11112 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11113
11114 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11115 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11116
11117 SDValue Fma4 =
11118 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11119
11120 SDValue Scale;
11121
11122 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11123 // Workaround a hardware bug on SI where the condition output from div_scale
11124 // is not usable.
11125
11126 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11127
11128 // Figure out which scale to use for div_fmas.
11129 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11130 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11131 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11132 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11133
11134 SDValue NumHi =
11135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11136 SDValue DenHi =
11137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11138
11139 SDValue Scale0Hi =
11140 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11141 SDValue Scale1Hi =
11142 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11143
11144 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11145 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11146 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11147 } else {
11148 Scale = DivScale1.getValue(1);
11149 }
11150
11151 SDValue Fmas =
11152 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11153
11154 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11155}
11156
11157SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11158 EVT VT = Op.getValueType();
11159
11160 if (VT == MVT::f32)
11161 return LowerFDIV32(Op, DAG);
11162
11163 if (VT == MVT::f64)
11164 return LowerFDIV64(Op, DAG);
11165
11166 if (VT == MVT::f16)
11167 return LowerFDIV16(Op, DAG);
11168
11169 llvm_unreachable("Unexpected type for fdiv");
11170}
11171
11172SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11173 SDLoc dl(Op);
11174 SDValue Val = Op.getOperand(0);
11175 EVT VT = Val.getValueType();
11176 EVT ResultExpVT = Op->getValueType(1);
11177 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11178
11179 SDValue Mant = DAG.getNode(
11180 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11181 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11182
11183 SDValue Exp = DAG.getNode(
11184 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11185 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11186
11187 if (Subtarget->hasFractBug()) {
11188 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11189 SDValue Inf =
11190 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11191
11192 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11193 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11194 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11195 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11196 }
11197
11198 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11199 return DAG.getMergeValues({Mant, CastExp}, dl);
11200}
11201
11202SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11203 SDLoc DL(Op);
11204 StoreSDNode *Store = cast<StoreSDNode>(Op);
11205 EVT VT = Store->getMemoryVT();
11206
11207 if (VT == MVT::i1) {
11208 return DAG.getTruncStore(
11209 Store->getChain(), DL,
11210 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11211 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11212 }
11213
11214 assert(VT.isVector() &&
11215 Store->getValue().getValueType().getScalarType() == MVT::i32);
11216
11217 unsigned AS = Store->getAddressSpace();
11218 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11219 Store->getAlign().value() < VT.getStoreSize() &&
11220 VT.getSizeInBits() > 32) {
11221 return SplitVectorStore(Op, DAG);
11222 }
11223
11224 MachineFunction &MF = DAG.getMachineFunction();
11225 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11226 // If there is a possibility that a flat instruction may access scratch
11227 // memory, then we need to use the same legalization rules we use for private.
11228 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11229 !Subtarget->hasMultiDwordFlatScratchAddressing())
11230 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11231 ? AMDGPUAS::PRIVATE_ADDRESS
11232 : AMDGPUAS::GLOBAL_ADDRESS;
11233
11234 unsigned NumElements = VT.getVectorNumElements();
11235 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11236 if (NumElements > 4)
11237 return SplitVectorStore(Op, DAG);
11238 // v3 stores not supported on SI.
11239 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11240 return SplitVectorStore(Op, DAG);
11241
11242 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11243 VT, *Store->getMemOperand()))
11244 return expandUnalignedStore(Store, DAG);
11245
11246 return SDValue();
11247 }
11248 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11249 switch (Subtarget->getMaxPrivateElementSize()) {
11250 case 4:
11251 return scalarizeVectorStore(Store, DAG);
11252 case 8:
11253 if (NumElements > 2)
11254 return SplitVectorStore(Op, DAG);
11255 return SDValue();
11256 case 16:
11257 if (NumElements > 4 ||
11258 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11259 return SplitVectorStore(Op, DAG);
11260 return SDValue();
11261 default:
11262 llvm_unreachable("unsupported private_element_size");
11263 }
11264 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11265 unsigned Fast = 0;
11266 auto Flags = Store->getMemOperand()->getFlags();
11267 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11268 Store->getAlign(), Flags, &Fast) &&
11269 Fast > 1)
11270 return SDValue();
11271
11272 if (VT.isVector())
11273 return SplitVectorStore(Op, DAG);
11274
11275 return expandUnalignedStore(Store, DAG);
11276 }
11277
11278 // Probably an invalid store. If so we'll end up emitting a selection error.
11279 return SDValue();
11280}
11281
11282// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11283SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11284 SDLoc SL(Op);
11285 assert(!Subtarget->has16BitInsts());
11286 SDNodeFlags Flags = Op->getFlags();
11287 SDValue Ext =
11288 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11289
11290 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11291 SDValue Sqrt =
11292 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11293
11294 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11295 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11296}
11297
11298SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11299 SDLoc DL(Op);
11300 SDNodeFlags Flags = Op->getFlags();
11301 MVT VT = Op.getValueType().getSimpleVT();
11302 const SDValue X = Op.getOperand(0);
11303
11304 if (allowApproxFunc(DAG, Flags)) {
11305 // Instruction is 1ulp but ignores denormals.
11306 return DAG.getNode(
11307 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11308 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11309 }
11310
11311 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11312 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11313
11314 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11315
11316 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11317
11318 SDValue SqrtX =
11319 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11320
11321 SDValue SqrtS;
11322 if (needsDenormHandlingF32(DAG, X, Flags)) {
11323 SDValue SqrtID =
11324 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11325 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11326
11327 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11328 SDValue SqrtSNextDownInt =
11329 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11330 DAG.getAllOnesConstant(DL, MVT::i32));
11331 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11332
11333 SDValue NegSqrtSNextDown =
11334 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11335
11336 SDValue SqrtVP =
11337 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11338
11339 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11340 DAG.getConstant(1, DL, MVT::i32));
11341 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11342
11343 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11344 SDValue SqrtVS =
11345 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11346
11347 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11348 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11349
11350 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11351 Flags);
11352
11353 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11354 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11355 Flags);
11356 } else {
11357 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11358
11359 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11360
11361 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11362 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11363 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11364
11365 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11366 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11367 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11368
11369 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11370 SDValue SqrtD =
11371 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11372 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11373 }
11374
11375 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11376
11377 SDValue ScaledDown =
11378 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11379
11380 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11381 SDValue IsZeroOrInf =
11382 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11383 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11384
11385 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11386}
11387
11388SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11389 // For double type, the SQRT and RSQ instructions don't have required
11390 // precision, we apply Goldschmidt's algorithm to improve the result:
11391 //
11392 // y0 = rsq(x)
11393 // g0 = x * y0
11394 // h0 = 0.5 * y0
11395 //
11396 // r0 = 0.5 - h0 * g0
11397 // g1 = g0 * r0 + g0
11398 // h1 = h0 * r0 + h0
11399 //
11400 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11401 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11402 // h2 = h1 * r1 + h1
11403 //
11404 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11405 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11406 //
11407 // sqrt(x) = g3
11408
11409 SDNodeFlags Flags = Op->getFlags();
11410
11411 SDLoc DL(Op);
11412
11413 SDValue X = Op.getOperand(0);
11414 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11415
11416 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11417
11418 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11419
11420 // Scale up input if it is too small.
11421 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11422 SDValue ScaleUp =
11423 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11424 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11425
11426 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11427
11428 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11429
11430 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11431 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11432
11433 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11434 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11435
11436 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11437
11438 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11439
11440 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11441 SDValue SqrtD0 =
11442 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11443
11444 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11445
11446 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11447 SDValue SqrtD1 =
11448 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11449
11450 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11451
11452 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11453 SDValue ScaleDown =
11454 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11455 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11456
11457 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11458 // with finite only or nsz because rsq(+/-0) = +/-inf
11459
11460 // TODO: Check for DAZ and expand to subnormals
11461 SDValue IsZeroOrInf =
11462 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11463 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11464
11465 // If x is +INF, +0, or -0, use its original value
11466 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11467 Flags);
11468}
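// A scalar sketch of the Goldschmidt recurrence above (illustrative only; the
// real lowering works on f64 DAG nodes and takes y0 from the hardware rsq):
//   double g = x * y0, h = 0.5 * y0;      // y0 ~ 1/sqrt(x)
//   double r = fma(-h, g, 0.5);           // r0
//   g = fma(g, r, g);                     // g1
//   h = fma(h, r, h);                     // h1
//   g = fma(fma(-g, g, x), h, g);         // g2 = d0*h1 + g1, d0 = x - g1*g1
//   g = fma(fma(-g, g, x), h, g);         // g3 approximates sqrt(x)
// The ldexp scaling wrapped around this sequence only keeps very small inputs
// out of the denormal range.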
11469
11470SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11471 SDLoc DL(Op);
11472 EVT VT = Op.getValueType();
11473 SDValue Arg = Op.getOperand(0);
11474 SDValue TrigVal;
11475
11476 // Propagate fast-math flags so that the multiply we introduce can be folded
11477 // if Arg is already the result of a multiply by constant.
11478 auto Flags = Op->getFlags();
11479
11480 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11481
11482 if (Subtarget->hasTrigReducedRange()) {
11483 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11484 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11485 } else {
11486 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11487 }
11488
11489 switch (Op.getOpcode()) {
11490 case ISD::FCOS:
11491 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11492 case ISD::FSIN:
11493 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11494 default:
11495 llvm_unreachable("Wrong trig opcode");
11496 }
11497}
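// Illustrative example: on subtargets with a reduced trig range, sin(x) is
// emitted roughly as sin_hw(fract(x * 1/(2*pi))), while other subtargets skip
// the fract; in both cases the hardware instruction consumes its operand in
// units of full rotations rather than radians.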
11498
11499SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11500 SelectionDAG &DAG) const {
11501 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11502 assert(AtomicNode->isCompareAndSwap());
11503 unsigned AS = AtomicNode->getAddressSpace();
11504
11505 // No custom lowering required for local address space
11507 return Op;
11508
11509 // Non-local address space requires custom lowering for atomic compare
11510 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11511 SDLoc DL(Op);
11512 SDValue ChainIn = Op.getOperand(0);
11513 SDValue Addr = Op.getOperand(1);
11514 SDValue Old = Op.getOperand(2);
11515 SDValue New = Op.getOperand(3);
11516 EVT VT = Op.getValueType();
11517 MVT SimpleVT = VT.getSimpleVT();
11518 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11519
11520 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11521 SDValue Ops[] = {ChainIn, Addr, NewOld};
11522
11524 Op->getVTList(), Ops, VT,
11525 AtomicNode->getMemOperand());
11526}
11527
11528//===----------------------------------------------------------------------===//
11529// Custom DAG optimizations
11530//===----------------------------------------------------------------------===//
11531
11532SDValue
11533SITargetLowering::performUCharToFloatCombine(SDNode *N,
11534 DAGCombinerInfo &DCI) const {
11535 EVT VT = N->getValueType(0);
11536 EVT ScalarVT = VT.getScalarType();
11537 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11538 return SDValue();
11539
11540 SelectionDAG &DAG = DCI.DAG;
11541 SDLoc DL(N);
11542
11543 SDValue Src = N->getOperand(0);
11544 EVT SrcVT = Src.getValueType();
11545
11546 // TODO: We could try to match extracting the higher bytes, which would be
11547 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11548 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11549 // about in practice.
11550 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11551 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11552 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11553 DCI.AddToWorklist(Cvt.getNode());
11554
11555 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11556 if (ScalarVT != MVT::f32) {
11557 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11558 DAG.getTargetConstant(0, DL, MVT::i32));
11559 }
11560 return Cvt;
11561 }
11562 }
11563
11564 return SDValue();
11565}
11566
11567SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11568 DAGCombinerInfo &DCI) const {
11569 SDValue MagnitudeOp = N->getOperand(0);
11570 SDValue SignOp = N->getOperand(1);
11571 SelectionDAG &DAG = DCI.DAG;
11572 SDLoc DL(N);
11573
11574 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11575 // lower half with a copy.
11576 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11577 if (MagnitudeOp.getValueType() == MVT::f64) {
11578 SDValue MagAsVector =
11579 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11580 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11581 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11582 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11583 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11584
11585 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11586
11587 SDValue Vector =
11588 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11589
11590 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11591 }
11592
11593 if (SignOp.getValueType() != MVT::f64)
11594 return SDValue();
11595
11596 // Reduce width of sign operand, we only need the highest bit.
11597 //
11598 // fcopysign f64:x, f64:y ->
11599 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11600 // TODO: In some cases it might make sense to go all the way to f16.
11601 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11602 SDValue SignAsF32 =
11603 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11604 DAG.getConstant(1, DL, MVT::i32));
11605
11606 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11607 SignAsF32);
11608}
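// Illustrative example of the f64 combine above (hypothetical bit patterns):
// for x = 0x7FF0000000000000 (+inf) and a negative sign operand, only the
// high dword of x changes; the combine keeps x.lo32 = 0x00000000 and rewrites
// x.hi32 with a 32-bit copysign, producing 0xFFF0000000000000 without
// touching the low half.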
11609
11610// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11611// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11612// bits
11613
11614// This is a variant of
11615// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11616//
11617// The normal DAG combiner will do this, but only if the add has one use since
11618// that would increase the number of instructions.
11619//
11620// This prevents us from seeing a constant offset that can be folded into a
11621// memory instruction's addressing mode. If we know the resulting add offset of
11622// a pointer can be folded into an addressing offset, we can replace the pointer
11623// operand with the add of new constant offset. This eliminates one of the uses,
11624// and may allow the remaining use to also be simplified.
11625//
11626SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11627 EVT MemVT,
11628 DAGCombinerInfo &DCI) const {
11629 SDValue N0 = N->getOperand(0);
11630 SDValue N1 = N->getOperand(1);
11631
11632 // We only do this to handle cases where it's profitable when there are
11633 // multiple uses of the add, so defer to the standard combine.
11634 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11635 N0->hasOneUse())
11636 return SDValue();
11637
11638 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11639 if (!CN1)
11640 return SDValue();
11641
11642 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11643 if (!CAdd)
11644 return SDValue();
11645
11646 SelectionDAG &DAG = DCI.DAG;
11647
11648 if (N0->getOpcode() == ISD::OR &&
11649 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11650 return SDValue();
11651
11652 // If the resulting offset is too large, we can't fold it into the
11653 // addressing mode offset.
11654 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11655 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11656
11657 AddrMode AM;
11658 AM.HasBaseReg = true;
11659 AM.BaseOffs = Offset.getSExtValue();
11660 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11661 return SDValue();
11662
11663 SDLoc SL(N);
11664 EVT VT = N->getValueType(0);
11665
11666 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11667 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11668
11669 SDNodeFlags Flags;
11670 Flags.setNoUnsignedWrap(
11671 N->getFlags().hasNoUnsignedWrap() &&
11672 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11673
11674 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11675}
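// Not part of this file: a minimal standalone check of the arithmetic identity
// the combine above relies on. The 'or' form is only valid when x and c1 share
// no set bits, matching the haveNoCommonBitsSet() guard in the code.
#include <cstdint>

constexpr bool shlPtrCombineIdentityHolds(uint64_t X, uint64_t C1, unsigned C2) {
  // (shl (add x, c1), c2) == add (shl x, c2), (shl c1, c2)
  bool AddForm = ((X + C1) << C2) == ((X << C2) + (C1 << C2));
  // (shl (or x, c1), c2) == add (shl x, c2), (shl c1, c2), given x & c1 == 0
  bool OrForm = (X & C1) != 0 || ((X | C1) << C2) == ((X << C2) + (C1 << C2));
  return AddForm && OrForm;
}

static_assert(shlPtrCombineIdentityHolds(0x1000, 0x10, 2), "add and or forms");
static_assert(shlPtrCombineIdentityHolds(0xff00, 0x3f, 4), "disjoint or form");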
11676
11677 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
11678 /// by the chain and the intrinsic ID. Theoretically we would also need to check
11679 /// the specific intrinsic, but they all place the pointer operand first.
11680static unsigned getBasePtrIndex(const MemSDNode *N) {
11681 switch (N->getOpcode()) {
11682 case ISD::STORE:
11683 case ISD::INTRINSIC_W_CHAIN:
11684 case ISD::INTRINSIC_VOID:
11685 return 2;
11686 default:
11687 return 1;
11688 }
11689}
11690
11691SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11692 DAGCombinerInfo &DCI) const {
11693 SelectionDAG &DAG = DCI.DAG;
11694 SDLoc SL(N);
11695
11696 unsigned PtrIdx = getBasePtrIndex(N);
11697 SDValue Ptr = N->getOperand(PtrIdx);
11698
11699 // TODO: We could also do this for multiplies.
11700 if (Ptr.getOpcode() == ISD::SHL) {
11701 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11702 N->getMemoryVT(), DCI);
11703 if (NewPtr) {
11704 SmallVector<SDValue, 8> NewOps(N->ops());
11705
11706 NewOps[PtrIdx] = NewPtr;
11707 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11708 }
11709 }
11710
11711 return SDValue();
11712}
11713
11714static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11715 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11716 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11717 (Opc == ISD::XOR && Val == 0);
11718}
11719
11720// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11721// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11722// integer combine opportunities since most 64-bit operations are decomposed
11723// this way. TODO: We won't want this for SALU especially if it is an inline
11724// immediate.
11725SDValue SITargetLowering::splitBinaryBitConstantOp(
11726 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11727 const ConstantSDNode *CRHS) const {
11728 uint64_t Val = CRHS->getZExtValue();
11729 uint32_t ValLo = Lo_32(Val);
11730 uint32_t ValHi = Hi_32(Val);
11732 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11733 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11734 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11735 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11736 // If we need to materialize a 64-bit immediate, it will be split up later
11737 // anyway. Avoid creating the harder to understand 64-bit immediate
11738 // materialization.
11739 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11740 }
11741
11742 return SDValue();
11743}
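// Not part of this file: a sketch of why a 64-bit and/or/xor with a constant
// can be split into two independent 32-bit operations on the low and high
// halves, which is what splitBinaryBitConstantOpImpl produces.
#include <cstdint>

constexpr uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
constexpr uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
constexpr uint64_t join64(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

constexpr bool splitAndHolds(uint64_t X, uint64_t C) {
  return (X & C) == join64(lo32(X) & lo32(C), hi32(X) & hi32(C));
}

static_assert(splitAndHolds(0x123456789abcdef0ull, 0xffffffff00000000ull), "");
static_assert(splitAndHolds(0xdeadbeefcafef00dull, 0x00000000ffffffffull), "");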
11744
11745 bool llvm::isBoolSGPR(SDValue V) {
11746 if (V.getValueType() != MVT::i1)
11747 return false;
11748 switch (V.getOpcode()) {
11749 default:
11750 break;
11751 case ISD::SETCC:
11752 case AMDGPUISD::FP_CLASS:
11753 return true;
11754 case ISD::AND:
11755 case ISD::OR:
11756 case ISD::XOR:
11757 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11758 }
11759 return false;
11760}
11761
11762// If a constant has all zeroes or all ones within each byte return it.
11763// Otherwise return 0.
11764 static uint32_t getConstantPermuteMask(uint32_t C) {
11765 // 0xff for any zero byte in the mask
11766 uint32_t ZeroByteMask = 0;
11767 if (!(C & 0x000000ff))
11768 ZeroByteMask |= 0x000000ff;
11769 if (!(C & 0x0000ff00))
11770 ZeroByteMask |= 0x0000ff00;
11771 if (!(C & 0x00ff0000))
11772 ZeroByteMask |= 0x00ff0000;
11773 if (!(C & 0xff000000))
11774 ZeroByteMask |= 0xff000000;
11775 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11776 if ((NonZeroByteMask & C) != NonZeroByteMask)
11777 return 0; // Partial bytes selected.
11778 return C;
11779}
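// Not part of this file: a standalone restatement of the rule implemented
// above -- the constant is returned unchanged only when every byte is entirely
// 0x00 or entirely 0xff, otherwise 0 is returned.
#include <cstdint>

constexpr uint32_t constantPermuteMaskSketch(uint32_t C) {
  for (unsigned I = 0; I < 32; I += 8) {
    uint32_t Byte = (C >> I) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}

static_assert(constantPermuteMaskSketch(0x00ff00ff) == 0x00ff00ff, "whole bytes");
static_assert(constantPermuteMaskSketch(0x00f000ff) == 0, "partial byte rejected");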
11780
11781// Check if a node selects whole bytes from its operand 0 starting at a byte
11782 // boundary while masking the rest. Returns the select mask as used in
11783 // v_perm_b32, or ~0u if it does not succeed.
11784// Note byte select encoding:
11785// value 0-3 selects corresponding source byte;
11786// value 0xc selects zero;
11787// value 0xff selects 0xff.
11788 static uint32_t getPermuteMask(SDValue V) {
11789 assert(V.getValueSizeInBits() == 32);
11790
11791 if (V.getNumOperands() != 2)
11792 return ~0;
11793
11794 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11795 if (!N1)
11796 return ~0;
11797
11798 uint32_t C = N1->getZExtValue();
11799
11800 switch (V.getOpcode()) {
11801 default:
11802 break;
11803 case ISD::AND:
11804 if (uint32_t ConstMask = getConstantPermuteMask(C))
11805 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11806 break;
11807
11808 case ISD::OR:
11809 if (uint32_t ConstMask = getConstantPermuteMask(C))
11810 return (0x03020100 & ~ConstMask) | ConstMask;
11811 break;
11812
11813 case ISD::SHL:
11814 if (C % 8)
11815 return ~0;
11816
11817 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11818
11819 case ISD::SRL:
11820 if (C % 8)
11821 return ~0;
11822
11823 return uint32_t(0x0c0c0c0c03020100ull >> C);
11824 }
11825
11826 return ~0;
11827}
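// Not part of this file: applying the single-source select masks getPermuteMask
// returns, using the byte select encoding documented above (0-3 picks a source
// byte, 0xc picks zero, 0xff picks 0xff).
#include <cstdint>

constexpr uint32_t applyByteSelect(uint32_t SelMask, uint32_t Src) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Sel = (SelMask >> (I * 8)) & 0xff;
    uint32_t Byte =
        Sel == 0xff ? 0xffu : Sel == 0x0c ? 0u : (Src >> (Sel * 8)) & 0xff;
    Result |= Byte << (I * 8);
  }
  return Result;
}

// (and x, 0x0000ffff) yields select mask 0x0c0c0100: keep bytes 0-1, zero 2-3.
static_assert(applyByteSelect(0x0c0c0100, 0xAABBCCDDu) == 0x0000CCDDu, "");
// (shl x, 8) yields select mask 0x0201000c: bytes move up, byte 0 becomes zero.
static_assert(applyByteSelect(0x0201000c, 0xAABBCCDDu) == 0xBBCCDD00u, "");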
11828
11829SDValue SITargetLowering::performAndCombine(SDNode *N,
11830 DAGCombinerInfo &DCI) const {
11831 if (DCI.isBeforeLegalize())
11832 return SDValue();
11833
11834 SelectionDAG &DAG = DCI.DAG;
11835 EVT VT = N->getValueType(0);
11836 SDValue LHS = N->getOperand(0);
11837 SDValue RHS = N->getOperand(1);
11838
11839 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11840 if (VT == MVT::i64 && CRHS) {
11841 if (SDValue Split =
11842 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11843 return Split;
11844 }
11845
11846 if (CRHS && VT == MVT::i32) {
11847 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11848 // nb = number of trailing zeroes in mask
11849 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11850 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11851 uint64_t Mask = CRHS->getZExtValue();
11852 unsigned Bits = llvm::popcount(Mask);
11853 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11854 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11855 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11856 unsigned Shift = CShift->getZExtValue();
11857 unsigned NB = CRHS->getAPIntValue().countr_zero();
11858 unsigned Offset = NB + Shift;
11859 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11860 SDLoc SL(N);
11861 SDValue BFE =
11862 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11863 DAG.getConstant(Offset, SL, MVT::i32),
11864 DAG.getConstant(Bits, SL, MVT::i32));
11865 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11866 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11867 DAG.getValueType(NarrowVT));
11868 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11869 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11870 return Shl;
11871 }
11872 }
11873 }
11874
11875 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11876 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11877 isa<ConstantSDNode>(LHS.getOperand(2))) {
11878 uint32_t Sel = getConstantPermuteMask(Mask);
11879 if (!Sel)
11880 return SDValue();
11881
11882 // Select 0xc for all zero bytes
11883 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11884 SDLoc DL(N);
11885 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11886 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11887 }
11888 }
11889
11890 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11891 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11892 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11893 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11894 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11895
11896 SDValue X = LHS.getOperand(0);
11897 SDValue Y = RHS.getOperand(0);
11898 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11899 !isTypeLegal(X.getValueType()))
11900 return SDValue();
11901
11902 if (LCC == ISD::SETO) {
11903 if (X != LHS.getOperand(1))
11904 return SDValue();
11905
11906 if (RCC == ISD::SETUNE) {
11907 const ConstantFPSDNode *C1 =
11908 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11909 if (!C1 || !C1->isInfinity() || C1->isNegative())
11910 return SDValue();
11911
11911
11912 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11913 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11914 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11915 SIInstrFlags::P_NORMAL;
11916
11917 static_assert(
11918 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11919 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11920 0x3ff) == Mask,
11921 "mask not equal");
11922
11923 SDLoc DL(N);
11924 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11925 DAG.getConstant(Mask, DL, MVT::i32));
11926 }
11927 }
11928 }
11929
11930 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11931 std::swap(LHS, RHS);
11932
11933 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11934 RHS.hasOneUse()) {
11935 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11936 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11937 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11938 // | n_nan)
11939 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11940 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11941 (RHS.getOperand(0) == LHS.getOperand(0) &&
11942 LHS.getOperand(0) == LHS.getOperand(1))) {
11943 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11944 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11945 : Mask->getZExtValue() & OrdMask;
11946
11947 SDLoc DL(N);
11948 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11949 DAG.getConstant(NewMask, DL, MVT::i32));
11950 }
11951 }
11952
11953 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11954 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11955 // and x, (sext cc from i1) => select cc, x, 0
11956 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11957 std::swap(LHS, RHS);
11958 if (isBoolSGPR(RHS.getOperand(0)))
11959 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11960 DAG.getConstant(0, SDLoc(N), MVT::i32));
11961 }
11962
11963 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11964 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11965 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11966 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11967 uint32_t LHSMask = getPermuteMask(LHS);
11968 uint32_t RHSMask = getPermuteMask(RHS);
11969 if (LHSMask != ~0u && RHSMask != ~0u) {
11970 // Canonicalize the expression in an attempt to have fewer unique masks
11971 // and therefore fewer registers used to hold the masks.
11972 if (LHSMask > RHSMask) {
11973 std::swap(LHSMask, RHSMask);
11974 std::swap(LHS, RHS);
11975 }
11976
11977 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11978 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11979 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11980 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11981
11982 // Check if we need to combine values from two sources within a byte.
11983 if (!(LHSUsedLanes & RHSUsedLanes) &&
11984 // If we select high and lower word keep it for SDWA.
11985 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11986 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11987 // Each byte in each mask is either a selector value 0-3, or has higher
11988 // bits set in either of the masks: 0xff for 0xff, or 0x0c for zero.
11989 // If 0x0c appears in either mask the result byte must be 0x0c; otherwise
11990 // the mask that is not 0xff wins. Anding both masks gives the correct
11991 // result, except that bytes which should be 0x0c are fixed up below.
11992 uint32_t Mask = LHSMask & RHSMask;
11993 for (unsigned I = 0; I < 32; I += 8) {
11994 uint32_t ByteSel = 0xff << I;
11995 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11996 Mask &= (0x0c << I) & 0xffffffff;
11997 }
11998
11999 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12000 // or 0x0c.
12001 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12002 SDLoc DL(N);
12003
12004 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12005 RHS.getOperand(0),
12006 DAG.getConstant(Sel, DL, MVT::i32));
12007 }
12008 }
12009 }
12010
12011 return SDValue();
12012}
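// Not part of this file: the bit identity behind the
// "and (srl x, c), mask => shl (bfe x, nb + c, width), nb" rewrite above, with
// the unsigned bitfield extract modeled as a shift-and-mask.
#include <cstdint>

constexpr uint32_t bfeU32(uint32_t X, unsigned Offset, unsigned Width) {
  return (X >> Offset) & ((1u << Width) - 1);
}

constexpr bool srlAndMaskMatchesBfe(uint32_t X, unsigned C, unsigned NB,
                                    unsigned Width) {
  uint32_t Mask = ((1u << Width) - 1) << NB; // shifted mask with NB trailing zeros
  return ((X >> C) & Mask) == (bfeU32(X, C + NB, Width) << NB);
}

static_assert(srlAndMaskMatchesBfe(0x89abcdefu, 8, 8, 8), "8-bit field at a byte boundary");
static_assert(srlAndMaskMatchesBfe(0x12345678u, 0, 16, 16), "16-bit field at a word boundary");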
12013
12014// A key component of v_perm is a mapping between byte position of the src
12015 // operands and the byte position of the dest. To provide such, we need: 1. the
12016 // node that provides byte x of the dest of the OR, and 2. the byte of that node
12017 // used to provide byte x. calculateByteProvider finds which node provides a
12018 // certain byte of the dest of the OR, and calculateSrcByte takes that node and
12019 // finds an ultimate src and byte position. For example, the supported
12020 // LoadCombine pattern for vector loads is as follows:
12021// t1
12022// or
12023// / \
12024// t2 t3
12025// zext shl
12026// | | \
12027// t4 t5 16
12028// or anyext
12029// / \ |
12030// t6 t7 t8
12031// srl shl or
12032// / | / \ / \
12033// t9 t10 t11 t12 t13 t14
12034// trunc* 8 trunc* 8 and and
12035// | | / | | \
12036// t15 t16 t17 t18 t19 t20
12037// trunc* 255 srl -256
12038// | / \
12039// t15 t15 16
12040//
12041// *In this example, the truncs are from i32->i16
12042//
12043// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12044// respectively. calculateSrcByte would find (given node) -> ultimate src &
12045// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12046// After finding the mapping, we can combine the tree into vperm t15, t16,
12047// 0x05000407
12048
12049// Find the source and byte position from a node.
12050// \p DestByte is the byte position of the dest of the or that the src
12051// ultimately provides. \p SrcIndex is the byte of the src that maps to this
12052 // byte of the dest of the or. \p Depth tracks how many recursive iterations we have
12053// performed.
12054static const std::optional<ByteProvider<SDValue>>
12055calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12056 unsigned Depth = 0) {
12057 // We may need to recursively traverse a series of SRLs
12058 if (Depth >= 6)
12059 return std::nullopt;
12060
12061 if (Op.getValueSizeInBits() < 8)
12062 return std::nullopt;
12063
12064 if (Op.getValueType().isVector())
12065 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12066
12067 switch (Op->getOpcode()) {
12068 case ISD::TRUNCATE: {
12069 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12070 }
12071
12072 case ISD::SIGN_EXTEND:
12073 case ISD::ZERO_EXTEND:
12074 case ISD::SIGN_EXTEND_INREG: {
12075 SDValue NarrowOp = Op->getOperand(0);
12076 auto NarrowVT = NarrowOp.getValueType();
12077 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12078 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12079 NarrowVT = VTSign->getVT();
12080 }
12081 if (!NarrowVT.isByteSized())
12082 return std::nullopt;
12083 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12084
12085 if (SrcIndex >= NarrowByteWidth)
12086 return std::nullopt;
12087 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12088 }
12089
12090 case ISD::SRA:
12091 case ISD::SRL: {
12092 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12093 if (!ShiftOp)
12094 return std::nullopt;
12095
12096 uint64_t BitShift = ShiftOp->getZExtValue();
12097
12098 if (BitShift % 8 != 0)
12099 return std::nullopt;
12100
12101 SrcIndex += BitShift / 8;
12102
12103 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12104 }
12105
12106 default: {
12107 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12108 }
12109 }
12110 llvm_unreachable("fully handled switch");
12111}
12112
12113// For a byte position in the result of an Or, traverse the tree and find the
12114// node (and the byte of the node) which ultimately provides this {Or,
12115// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12116// the byte position of the Op that corresponds with the originally requested
12117 // byte of the Or. \p Depth tracks how many recursive iterations we have
12118 // performed. \p StartingIndex is the originally requested byte of the Or.
12119static const std::optional<ByteProvider<SDValue>>
12120calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12121 unsigned StartingIndex = 0) {
12122 // Finding Src tree of RHS of or typically requires at least 1 additional
12123 // depth
12124 if (Depth > 6)
12125 return std::nullopt;
12126
12127 unsigned BitWidth = Op.getScalarValueSizeInBits();
12128 if (BitWidth % 8 != 0)
12129 return std::nullopt;
12130 if (Index > BitWidth / 8 - 1)
12131 return std::nullopt;
12132
12133 bool IsVec = Op.getValueType().isVector();
12134 switch (Op.getOpcode()) {
12135 case ISD::OR: {
12136 if (IsVec)
12137 return std::nullopt;
12138
12139 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12140 StartingIndex);
12141 if (!RHS)
12142 return std::nullopt;
12143 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12144 StartingIndex);
12145 if (!LHS)
12146 return std::nullopt;
12147 // A well formed Or will have two ByteProviders for each byte, one of which
12148 // is constant zero
12149 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12150 return std::nullopt;
12151 if (!LHS || LHS->isConstantZero())
12152 return RHS;
12153 if (!RHS || RHS->isConstantZero())
12154 return LHS;
12155 return std::nullopt;
12156 }
12157
12158 case ISD::AND: {
12159 if (IsVec)
12160 return std::nullopt;
12161
12162 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12163 if (!BitMaskOp)
12164 return std::nullopt;
12165
12166 uint32_t BitMask = BitMaskOp->getZExtValue();
12167 // Bits we expect for our StartingIndex
12168 uint32_t IndexMask = 0xFF << (Index * 8);
12169
12170 if ((IndexMask & BitMask) != IndexMask) {
12171 // If the result of the and partially provides the byte, then it
12172 // is not well formatted
12173 if (IndexMask & BitMask)
12174 return std::nullopt;
12175 return ByteProvider<SDValue>::getConstantZero();
12176 }
12177
12178 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12179 }
12180
12181 case ISD::FSHR: {
12182 if (IsVec)
12183 return std::nullopt;
12184
12185 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12186 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12187 if (!ShiftOp || Op.getValueType().isVector())
12188 return std::nullopt;
12189
12190 uint64_t BitsProvided = Op.getValueSizeInBits();
12191 if (BitsProvided % 8 != 0)
12192 return std::nullopt;
12193
12194 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12195 if (BitShift % 8)
12196 return std::nullopt;
12197
12198 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12199 uint64_t ByteShift = BitShift / 8;
12200
12201 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12202 uint64_t BytesProvided = BitsProvided / 8;
12203 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12204 NewIndex %= BytesProvided;
12205 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12206 }
12207
12208 case ISD::SRA:
12209 case ISD::SRL: {
12210 if (IsVec)
12211 return std::nullopt;
12212
12213 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12214 if (!ShiftOp)
12215 return std::nullopt;
12216
12217 uint64_t BitShift = ShiftOp->getZExtValue();
12218 if (BitShift % 8)
12219 return std::nullopt;
12220
12221 auto BitsProvided = Op.getScalarValueSizeInBits();
12222 if (BitsProvided % 8 != 0)
12223 return std::nullopt;
12224
12225 uint64_t BytesProvided = BitsProvided / 8;
12226 uint64_t ByteShift = BitShift / 8;
12227 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12228 // If the byte we are trying to provide (as tracked by index) falls in this
12229 // range, then the SRL provides the byte. The byte of interest of the src of
12230 // the SRL is Index + ByteShift
12231 return BytesProvided - ByteShift > Index
12232 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12233 Index + ByteShift)
12234 : ByteProvider<SDValue>::getConstantZero();
12235 }
12236
12237 case ISD::SHL: {
12238 if (IsVec)
12239 return std::nullopt;
12240
12241 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12242 if (!ShiftOp)
12243 return std::nullopt;
12244
12245 uint64_t BitShift = ShiftOp->getZExtValue();
12246 if (BitShift % 8 != 0)
12247 return std::nullopt;
12248 uint64_t ByteShift = BitShift / 8;
12249
12250 // If we are shifting by an amount greater than (or equal to)
12251 // the index we are trying to provide, then it provides 0s. If not,
12252 // then these bytes are not definitively 0s, and the corresponding byte
12253 // of interest is Index - ByteShift of the src
12254 return Index < ByteShift
12255 ? ByteProvider<SDValue>::getConstantZero()
12256 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12257 Depth + 1, StartingIndex);
12258 }
12259 case ISD::ANY_EXTEND:
12260 case ISD::SIGN_EXTEND:
12261 case ISD::ZERO_EXTEND:
12262 case ISD::SIGN_EXTEND_INREG:
12263 case ISD::AssertZext:
12264 case ISD::AssertSext: {
12265 if (IsVec)
12266 return std::nullopt;
12267
12268 SDValue NarrowOp = Op->getOperand(0);
12269 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12270 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12271 Op->getOpcode() == ISD::AssertZext ||
12272 Op->getOpcode() == ISD::AssertSext) {
12273 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12274 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12275 }
12276 if (NarrowBitWidth % 8 != 0)
12277 return std::nullopt;
12278 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12279
12280 if (Index >= NarrowByteWidth)
12281 return Op.getOpcode() == ISD::ZERO_EXTEND
12282 ? std::optional<ByteProvider<SDValue>>(
12283 ByteProvider<SDValue>::getConstantZero())
12284 : std::nullopt;
12285 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12286 }
12287
12288 case ISD::TRUNCATE: {
12289 if (IsVec)
12290 return std::nullopt;
12291
12292 uint64_t NarrowByteWidth = BitWidth / 8;
12293
12294 if (NarrowByteWidth >= Index) {
12295 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12296 StartingIndex);
12297 }
12298
12299 return std::nullopt;
12300 }
12301
12302 case ISD::CopyFromReg: {
12303 if (BitWidth / 8 > Index)
12304 return calculateSrcByte(Op, StartingIndex, Index);
12305
12306 return std::nullopt;
12307 }
12308
12309 case ISD::LOAD: {
12310 auto *L = cast<LoadSDNode>(Op.getNode());
12311
12312 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12313 if (NarrowBitWidth % 8 != 0)
12314 return std::nullopt;
12315 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12316
12317 // If the width of the load does not reach the byte we are trying to provide for
12318 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12319 // question
12320 if (Index >= NarrowByteWidth) {
12321 return L->getExtensionType() == ISD::ZEXTLOAD
12322 ? std::optional<ByteProvider<SDValue>>(
12323 ByteProvider<SDValue>::getConstantZero())
12324 : std::nullopt;
12325 }
12326
12327 if (NarrowByteWidth > Index) {
12328 return calculateSrcByte(Op, StartingIndex, Index);
12329 }
12330
12331 return std::nullopt;
12332 }
12333
12334 case ISD::BSWAP: {
12335 if (IsVec)
12336 return std::nullopt;
12337
12338 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12339 Depth + 1, StartingIndex);
12340 }
12341
12342 case ISD::EXTRACT_VECTOR_ELT: {
12343 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12344 if (!IdxOp)
12345 return std::nullopt;
12346 auto VecIdx = IdxOp->getZExtValue();
12347 auto ScalarSize = Op.getScalarValueSizeInBits();
12348 if (ScalarSize < 32)
12349 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12350 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12351 StartingIndex, Index);
12352 }
12353
12354 case AMDGPUISD::PERM: {
12355 if (IsVec)
12356 return std::nullopt;
12357
12358 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12359 if (!PermMask)
12360 return std::nullopt;
12361
12362 auto IdxMask =
12363 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12364 if (IdxMask > 0x07 && IdxMask != 0x0c)
12365 return std::nullopt;
12366
12367 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12368 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12369
12370 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12371 : ByteProvider<SDValue>(
12372 ByteProvider<SDValue>::getConstantZero());
12373 }
12374
12375 default: {
12376 return std::nullopt;
12377 }
12378 }
12379
12380 llvm_unreachable("fully handled switch");
12381}
12382
12383 // Returns true if the Operand is a scalar that was extended from a 16-bit value
12384static bool isExtendedFrom16Bits(SDValue &Operand) {
12385
12386 switch (Operand.getOpcode()) {
12387 case ISD::ANY_EXTEND:
12388 case ISD::SIGN_EXTEND:
12389 case ISD::ZERO_EXTEND: {
12390 auto OpVT = Operand.getOperand(0).getValueType();
12391 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12392 }
12393 case ISD::LOAD: {
12394 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12395 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12396 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12397 ExtType == ISD::EXTLOAD) {
12398 auto MemVT = L->getMemoryVT();
12399 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12400 }
12401 return L->getMemoryVT().getSizeInBits() == 16;
12402 }
12403 default:
12404 return false;
12405 }
12406}
12407
12408// Returns true if the mask matches consecutive bytes, and the first byte
12409 // begins at an even (16-bit aligned) byte offset from the 0th byte
12410static bool addresses16Bits(int Mask) {
12411 int Low8 = Mask & 0xff;
12412 int Hi8 = (Mask & 0xff00) >> 8;
12413
12414 assert(Low8 < 8 && Hi8 < 8);
12415 // Are the bytes contiguous in the order of increasing addresses.
12416 bool IsConsecutive = (Hi8 - Low8 == 1);
12417 // Is the first byte at location that is aligned for 16 bit instructions.
12418 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12419 // In this case, we still need code to extract the 16 bit operand, so it
12420 // is better to use i8 v_perm
12421 bool Is16Aligned = !(Low8 % 2);
12422
12423 return IsConsecutive && Is16Aligned;
12424}
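// Not part of this file: a restatement of the rule above -- a 16-bit half of a
// perm mask is cleanly addressable only if it selects two consecutive bytes
// starting at an even byte offset.
constexpr bool addresses16BitsSketch(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  return (Hi8 - Low8 == 1) && (Low8 % 2 == 0);
}

static_assert(addresses16BitsSketch(0x0100), "bytes 0 and 1: consecutive, aligned");
static_assert(addresses16BitsSketch(0x0302), "bytes 2 and 3: consecutive, aligned");
static_assert(!addresses16BitsSketch(0x0201), "bytes 1 and 2 straddle a 16-bit boundary");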
12425
12426// Do not lower into v_perm if the operands are actually 16 bit
12427// and the selected bits (based on PermMask) correspond with two
12428// easily addressable 16 bit operands.
12429 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12430 SDValue &OtherOp) {
12431 int Low16 = PermMask & 0xffff;
12432 int Hi16 = (PermMask & 0xffff0000) >> 16;
12433
12434 auto TempOp = peekThroughBitcasts(Op);
12435 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12436
12437 auto OpIs16Bit =
12438 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12439 if (!OpIs16Bit)
12440 return true;
12441
12442 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12443 isExtendedFrom16Bits(TempOtherOp);
12444 if (!OtherOpIs16Bit)
12445 return true;
12446
12447 // Do we cleanly address both
12448 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12449}
12450
12451 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12452 unsigned DWordOffset) {
12453 SDValue Ret;
12454
12455 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12456 // ByteProvider must be at least 8 bits
12457 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12458
12459 if (TypeSize <= 32)
12460 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12461
12462 if (Src.getValueType().isVector()) {
12463 auto ScalarTySize = Src.getScalarValueSizeInBits();
12464 auto ScalarTy = Src.getValueType().getScalarType();
12465 if (ScalarTySize == 32) {
12466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12467 DAG.getConstant(DWordOffset, SL, MVT::i32));
12468 }
12469 if (ScalarTySize > 32) {
12470 Ret = DAG.getNode(
12471 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12472 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12473 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12474 if (ShiftVal)
12475 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12476 DAG.getConstant(ShiftVal, SL, MVT::i32));
12477 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12478 }
12479
12480 assert(ScalarTySize < 32);
12481 auto NumElements = TypeSize / ScalarTySize;
12482 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12483 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12484 auto NumElementsIn32 = 32 / ScalarTySize;
12485 auto NumAvailElements = DWordOffset < Trunc32Elements
12486 ? NumElementsIn32
12487 : NumElements - NormalizedTrunc;
12488
12489 SmallVector<SDValue, 4> VecSrcs;
12490 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12491 NumAvailElements);
12492
12493 Ret = DAG.getBuildVector(
12494 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12495 VecSrcs);
12496 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12497 }
12498
12499 /// Scalar Type
12500 auto ShiftVal = 32 * DWordOffset;
12501 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12502 DAG.getConstant(ShiftVal, SL, MVT::i32));
12503 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12504}
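// Not part of this file: for a plain scalar source the helper above reduces to
// "shift right by 32 * DWordOffset and truncate to 32 bits", shown here on a
// 64-bit value.
#include <cstdint>

constexpr uint32_t dwordFromOffsetScalar(uint64_t Src, unsigned DWordOffset) {
  return static_cast<uint32_t>(Src >> (32 * DWordOffset));
}

static_assert(dwordFromOffsetScalar(0x1122334455667788ull, 0) == 0x55667788u, "");
static_assert(dwordFromOffsetScalar(0x1122334455667788ull, 1) == 0x11223344u, "");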
12505
12506 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12507 SelectionDAG &DAG = DCI.DAG;
12508 [[maybe_unused]] EVT VT = N->getValueType(0);
12509 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12510
12511 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12512 assert(VT == MVT::i32);
12513 for (int i = 0; i < 4; i++) {
12514 // Find the ByteProvider that provides the ith byte of the result of OR
12515 std::optional<ByteProvider<SDValue>> P =
12516 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12517 // TODO support constantZero
12518 if (!P || P->isConstantZero())
12519 return SDValue();
12520
12521 PermNodes.push_back(*P);
12522 }
12523 if (PermNodes.size() != 4)
12524 return SDValue();
12525
12526 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12527 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12528 uint64_t PermMask = 0x00000000;
12529 for (size_t i = 0; i < PermNodes.size(); i++) {
12530 auto PermOp = PermNodes[i];
12531 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12532 // by sizeof(Src2) = 4
12533 int SrcByteAdjust = 4;
12534
12535 // If the Src uses a byte from a different DWORD, then it corresponds
12536 // with a different source
12537 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12538 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12539 if (SecondSrc)
12540 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12541 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12542 return SDValue();
12543
12544 // Set the index of the second distinct Src node
12545 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12546 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12547 SrcByteAdjust = 0;
12548 }
12549 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12551 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12552 }
12553 SDLoc DL(N);
12554 SDValue Op = *PermNodes[FirstSrc.first].Src;
12555 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12556 assert(Op.getValueSizeInBits() == 32);
12557
12558 // Check that we are not just extracting the bytes in order from an op
12559 if (!SecondSrc) {
12560 int Low16 = PermMask & 0xffff;
12561 int Hi16 = (PermMask & 0xffff0000) >> 16;
12562
12563 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12564 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12565
12566 // The perm op would really just produce Op. So combine into Op
12567 if (WellFormedLow && WellFormedHi)
12568 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12569 }
12570
12571 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12572
12573 if (SecondSrc) {
12574 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12575 assert(OtherOp.getValueSizeInBits() == 32);
12576 }
12577
12578 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12579
12580 assert(Op.getValueType().isByteSized() &&
12581 OtherOp.getValueType().isByteSized());
12582
12583 // If the ultimate src is less than 32 bits, then we will only be
12584 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12585 // CalculateByteProvider would not have returned Op as source if we
12586 // used a byte that is outside its ValueType. Thus, we are free to
12587 // ANY_EXTEND as the extended bits are dont-cares.
12588 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12589 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12590
12591 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12592 DAG.getConstant(PermMask, DL, MVT::i32));
12593 }
12594 return SDValue();
12595}
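// Not part of this file: how a PermMask built by matchPERM selects bytes. The
// selector convention follows the SrcByteAdjust logic above (values 0-3 index
// the second operand, 4-7 index the first, 0xc would select zero); the mapping
// onto v_perm_b32's actual src0/src1 operands is an assumption of this sketch.
#include <cstdint>

constexpr uint32_t permSketch(uint32_t Op, uint32_t OtherOp, uint32_t PermMask) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Sel = (PermMask >> (I * 8)) & 0xff;
    uint32_t Byte = Sel == 0x0c ? 0u
                    : Sel < 4   ? (OtherOp >> (Sel * 8)) & 0xff
                                : (Op >> ((Sel - 4) * 8)) & 0xff;
    Result |= Byte << (I * 8);
  }
  return Result;
}

// Mask 0x05040100 interleaves the low halves: result bytes are
// {Op.byte1, Op.byte0, OtherOp.byte1, OtherOp.byte0} from high to low.
static_assert(permSketch(0xAABBCCDDu, 0x11223344u, 0x05040100) == 0xCCDD3344u, "");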
12596
12597SDValue SITargetLowering::performOrCombine(SDNode *N,
12598 DAGCombinerInfo &DCI) const {
12599 SelectionDAG &DAG = DCI.DAG;
12600 SDValue LHS = N->getOperand(0);
12601 SDValue RHS = N->getOperand(1);
12602
12603 EVT VT = N->getValueType(0);
12604 if (VT == MVT::i1) {
12605 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12606 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12607 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12608 SDValue Src = LHS.getOperand(0);
12609 if (Src != RHS.getOperand(0))
12610 return SDValue();
12611
12612 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12613 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12614 if (!CLHS || !CRHS)
12615 return SDValue();
12616
12617 // Only 10 bits are used.
12618 static const uint32_t MaxMask = 0x3ff;
12619
12620 uint32_t NewMask =
12621 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12622 SDLoc DL(N);
12623 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12624 DAG.getConstant(NewMask, DL, MVT::i32));
12625 }
12626
12627 return SDValue();
12628 }
12629
12630 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12631 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12632 LHS.getOpcode() == AMDGPUISD::PERM &&
12633 isa<ConstantSDNode>(LHS.getOperand(2))) {
12634 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12635 if (!Sel)
12636 return SDValue();
12637
12638 Sel |= LHS.getConstantOperandVal(2);
12639 SDLoc DL(N);
12640 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12641 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12642 }
12643
12644 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12646 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12647 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12648
12649 // If all the uses of an or need to extract the individual elements, do not
12650 // attempt to lower into v_perm
12651 auto usesCombinedOperand = [](SDNode *OrUse) {
12652 // If we have any non-vectorized use, then it is a candidate for v_perm
12653 if (OrUse->getOpcode() != ISD::BITCAST ||
12654 !OrUse->getValueType(0).isVector())
12655 return true;
12656
12657 // If we have any non-vectorized use, then it is a candidate for v_perm
12658 for (auto *VUser : OrUse->users()) {
12659 if (!VUser->getValueType(0).isVector())
12660 return true;
12661
12662 // If the use of a vector is a store, then combining via a v_perm
12663 // is beneficial.
12664 // TODO -- whitelist more uses
12665 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12666 if (VUser->getOpcode() == VectorwiseOp)
12667 return true;
12668 }
12669 return false;
12670 };
12671
12672 if (!any_of(N->users(), usesCombinedOperand))
12673 return SDValue();
12674
12675 uint32_t LHSMask = getPermuteMask(LHS);
12676 uint32_t RHSMask = getPermuteMask(RHS);
12677
12678 if (LHSMask != ~0u && RHSMask != ~0u) {
12679 // Canonicalize the expression in an attempt to have fewer unique masks
12680 // and therefore fewer registers used to hold the masks.
12681 if (LHSMask > RHSMask) {
12682 std::swap(LHSMask, RHSMask);
12683 std::swap(LHS, RHS);
12684 }
12685
12686 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12687 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12688 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12689 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12690
12691 // Check if we need to combine values from two sources within a byte.
12692 if (!(LHSUsedLanes & RHSUsedLanes) &&
12693 // If we select high and lower word keep it for SDWA.
12694 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12695 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12696 // Kill zero bytes selected by other mask. Zero value is 0xc.
12697 LHSMask &= ~RHSUsedLanes;
12698 RHSMask &= ~LHSUsedLanes;
12699 // Add 4 to each active LHS lane
12700 LHSMask |= LHSUsedLanes & 0x04040404;
12701 // Combine masks
12702 uint32_t Sel = LHSMask | RHSMask;
12703 SDLoc DL(N);
12704
12705 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12706 RHS.getOperand(0),
12707 DAG.getConstant(Sel, DL, MVT::i32));
12708 }
12709 }
12710 if (LHSMask == ~0u || RHSMask == ~0u) {
12711 if (SDValue Perm = matchPERM(N, DCI))
12712 return Perm;
12713 }
12714 }
12715
12716 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12717 return SDValue();
12718
12719 // TODO: This could be a generic combine with a predicate for extracting the
12720 // high half of an integer being free.
12721
12722 // (or i64:x, (zero_extend i32:y)) ->
12723 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12724 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12725 RHS.getOpcode() != ISD::ZERO_EXTEND)
12726 std::swap(LHS, RHS);
12727
12728 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12729 SDValue ExtSrc = RHS.getOperand(0);
12730 EVT SrcVT = ExtSrc.getValueType();
12731 if (SrcVT == MVT::i32) {
12732 SDLoc SL(N);
12733 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12734 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12735
12736 DCI.AddToWorklist(LowOr.getNode());
12737 DCI.AddToWorklist(HiBits.getNode());
12738
12739 SDValue Vec =
12740 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12741 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12742 }
12743 }
12744
12745 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12746 if (CRHS) {
12747 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12748 N->getOperand(0), CRHS))
12749 return Split;
12750 }
12751
12752 return SDValue();
12753}
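// Not part of this file: the identity behind the
// (or i64:x, (zero_extend i32:y)) rewrite above -- only the low 32 bits change,
// so the high half of x can be reused unchanged.
#include <cstdint>

constexpr bool orZextSplitHolds(uint64_t X, uint32_t Y) {
  uint32_t LowOr = static_cast<uint32_t>(X) | Y;
  uint32_t HiBits = static_cast<uint32_t>(X >> 32);
  return (X | Y) == ((static_cast<uint64_t>(HiBits) << 32) | LowOr);
}

static_assert(orZextSplitHolds(0xdeadbeef00001111ull, 0x22220000u), "");
static_assert(orZextSplitHolds(0xffffffff00000000ull, 0x12345678u), "");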
12754
12755SDValue SITargetLowering::performXorCombine(SDNode *N,
12756 DAGCombinerInfo &DCI) const {
12757 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12758 return RV;
12759
12760 SDValue LHS = N->getOperand(0);
12761 SDValue RHS = N->getOperand(1);
12762
12763 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12764 SelectionDAG &DAG = DCI.DAG;
12765
12766 EVT VT = N->getValueType(0);
12767 if (CRHS && VT == MVT::i64) {
12768 if (SDValue Split =
12769 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12770 return Split;
12771 }
12772
12773 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12774 // fneg-like xors into 64-bit select.
12775 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12776 // This looks like an fneg, try to fold as a source modifier.
12777 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12778 shouldFoldFNegIntoSrc(N, LHS)) {
12779 // xor (select c, a, b), 0x80000000 ->
12780 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12781 SDLoc DL(N);
12782 SDValue CastLHS =
12783 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12784 SDValue CastRHS =
12785 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12786 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12787 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12788 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12789 LHS->getOperand(0), FNegLHS, FNegRHS);
12790 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12791 }
12792 }
12793
12794 return SDValue();
12795}
12796
12797SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12798 DAGCombinerInfo &DCI) const {
12799 if (!Subtarget->has16BitInsts() ||
12800 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12801 return SDValue();
12802
12803 EVT VT = N->getValueType(0);
12804 if (VT != MVT::i32)
12805 return SDValue();
12806
12807 SDValue Src = N->getOperand(0);
12808 if (Src.getValueType() != MVT::i16)
12809 return SDValue();
12810
12811 return SDValue();
12812}
12813
12814SDValue
12815SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12816 DAGCombinerInfo &DCI) const {
12817 SDValue Src = N->getOperand(0);
12818 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12819
12820 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12821 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12822 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12823 VTSign->getVT() == MVT::i8) ||
12824 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12825 VTSign->getVT() == MVT::i16))) {
12826 assert(Subtarget->hasScalarSubwordLoads() &&
12827 "s_buffer_load_{u8, i8} are supported "
12828 "in GFX12 (or newer) architectures.");
12829 EVT VT = Src.getValueType();
12830 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12831 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12832 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12833 SDLoc DL(N);
12834 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12835 SDValue Ops[] = {
12836 Src.getOperand(0), // source register
12837 Src.getOperand(1), // offset
12838 Src.getOperand(2) // cachePolicy
12839 };
12840 auto *M = cast<MemSDNode>(Src);
12841 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12842 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12843 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12844 return LoadVal;
12845 }
12846 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12847 VTSign->getVT() == MVT::i8) ||
12848 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12849 VTSign->getVT() == MVT::i16)) &&
12850 Src.hasOneUse()) {
12851 auto *M = cast<MemSDNode>(Src);
12852 SDValue Ops[] = {Src.getOperand(0), // Chain
12853 Src.getOperand(1), // rsrc
12854 Src.getOperand(2), // vindex
12855 Src.getOperand(3), // voffset
12856 Src.getOperand(4), // soffset
12857 Src.getOperand(5), // offset
12858 Src.getOperand(6), Src.getOperand(7)};
12859 // replace with BUFFER_LOAD_BYTE/SHORT
12860 SDVTList ResList =
12861 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12862 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12863 ? AMDGPUISD::BUFFER_LOAD_BYTE
12864 : AMDGPUISD::BUFFER_LOAD_SHORT;
12865 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12866 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12867 return DCI.DAG.getMergeValues(
12868 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12869 }
12870 return SDValue();
12871}
12872
12873SDValue SITargetLowering::performClassCombine(SDNode *N,
12874 DAGCombinerInfo &DCI) const {
12875 SelectionDAG &DAG = DCI.DAG;
12876 SDValue Mask = N->getOperand(1);
12877
12878 // fp_class x, 0 -> false
12879 if (isNullConstant(Mask))
12880 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12881
12882 if (N->getOperand(0).isUndef())
12883 return DAG.getUNDEF(MVT::i1);
12884
12885 return SDValue();
12886}
12887
12888SDValue SITargetLowering::performRcpCombine(SDNode *N,
12889 DAGCombinerInfo &DCI) const {
12890 EVT VT = N->getValueType(0);
12891 SDValue N0 = N->getOperand(0);
12892
12893 if (N0.isUndef()) {
12894 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12895 SDLoc(N), VT);
12896 }
12897
12898 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12899 N0.getOpcode() == ISD::SINT_TO_FP)) {
12900 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12901 N->getFlags());
12902 }
12903
12904 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12905 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12906 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12907 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12908 N->getFlags());
12909 }
12910
12912}
12913
12914 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12915 unsigned MaxDepth) const {
12916 unsigned Opcode = Op.getOpcode();
12917 if (Opcode == ISD::FCANONICALIZE)
12918 return true;
12919
12920 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12921 const auto &F = CFP->getValueAPF();
12922 if (F.isNaN() && F.isSignaling())
12923 return false;
12924 if (!F.isDenormal())
12925 return true;
12926
12927 DenormalMode Mode =
12928 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12929 return Mode == DenormalMode::getIEEE();
12930 }
12931
12932 // If source is a result of another standard FP operation it is already in
12933 // canonical form.
12934 if (MaxDepth == 0)
12935 return false;
12936
12937 switch (Opcode) {
12938 // These will flush denorms if required.
12939 case ISD::FADD:
12940 case ISD::FSUB:
12941 case ISD::FMUL:
12942 case ISD::FCEIL:
12943 case ISD::FFLOOR:
12944 case ISD::FMA:
12945 case ISD::FMAD:
12946 case ISD::FSQRT:
12947 case ISD::FDIV:
12948 case ISD::FREM:
12949 case ISD::FP_ROUND:
12950 case ISD::FP_EXTEND:
12951 case ISD::FP16_TO_FP:
12952 case ISD::FP_TO_FP16:
12953 case ISD::BF16_TO_FP:
12954 case ISD::FP_TO_BF16:
12955 case ISD::FLDEXP:
12956 case AMDGPUISD::FMUL_LEGACY:
12957 case AMDGPUISD::FMAD_FTZ:
12958 case AMDGPUISD::RCP:
12959 case AMDGPUISD::RSQ:
12960 case AMDGPUISD::RSQ_CLAMP:
12961 case AMDGPUISD::RCP_LEGACY:
12962 case AMDGPUISD::RCP_IFLAG:
12963 case AMDGPUISD::LOG:
12964 case AMDGPUISD::EXP:
12965 case AMDGPUISD::DIV_SCALE:
12966 case AMDGPUISD::DIV_FMAS:
12967 case AMDGPUISD::DIV_FIXUP:
12968 case AMDGPUISD::FRACT:
12969 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12970 case AMDGPUISD::CVT_F32_UBYTE0:
12971 case AMDGPUISD::CVT_F32_UBYTE1:
12972 case AMDGPUISD::CVT_F32_UBYTE2:
12973 case AMDGPUISD::CVT_F32_UBYTE3:
12974 case AMDGPUISD::FP_TO_FP16:
12975 case AMDGPUISD::SIN_HW:
12976 case AMDGPUISD::COS_HW:
12977 return true;
12978
12979 // It can/will be lowered or combined as a bit operation.
12980 // Need to check their input recursively to handle.
12981 case ISD::FNEG:
12982 case ISD::FABS:
12983 case ISD::FCOPYSIGN:
12984 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12985
12986 case ISD::AND:
12987 if (Op.getValueType() == MVT::i32) {
12988 // Be careful as we only know it is a bitcast floating point type. It
12989 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12990 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12991 // is valid to optimize for all types.
12992 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12993 if (RHS->getZExtValue() == 0xffff0000) {
12994 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12995 }
12996 }
12997 }
12998 break;
12999
13000 case ISD::FSIN:
13001 case ISD::FCOS:
13002 case ISD::FSINCOS:
13003 return Op.getValueType().getScalarType() != MVT::f16;
13004
13005 case ISD::FMINNUM:
13006 case ISD::FMAXNUM:
13007 case ISD::FMINNUM_IEEE:
13008 case ISD::FMAXNUM_IEEE:
13009 case ISD::FMINIMUM:
13010 case ISD::FMAXIMUM:
13011 case AMDGPUISD::CLAMP:
13012 case AMDGPUISD::FMED3:
13013 case AMDGPUISD::FMAX3:
13014 case AMDGPUISD::FMIN3:
13015 case AMDGPUISD::FMAXIMUM3:
13016 case AMDGPUISD::FMINIMUM3: {
13017 // FIXME: Shouldn't treat the generic operations differently based on these.
13018 // However, we aren't really required to flush the result from
13019 // minnum/maxnum.
13020
13021 // snans will be quieted, so we only need to worry about denormals.
13022 if (Subtarget->supportsMinMaxDenormModes() ||
13023 // FIXME: denormalsEnabledForType is broken for dynamic
13024 denormalsEnabledForType(DAG, Op.getValueType()))
13025 return true;
13026
13027 // Flushing may be required.
13028 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13029 // targets we need to check their inputs recursively.
13030
13031 // FIXME: Does this apply with clamp? It's implemented with max.
13032 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13033 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13034 return false;
13035 }
13036
13037 return true;
13038 }
13039 case ISD::SELECT: {
13040 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13041 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13042 }
13043 case ISD::BUILD_VECTOR: {
13044 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13045 SDValue SrcOp = Op.getOperand(i);
13046 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13047 return false;
13048 }
13049
13050 return true;
13051 }
13052 case ISD::EXTRACT_VECTOR_ELT:
13053 case ISD::EXTRACT_SUBVECTOR: {
13054 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13055 }
13056 case ISD::INSERT_VECTOR_ELT: {
13057 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13058 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13059 }
13060 case ISD::UNDEF:
13061 // Could be anything.
13062 return false;
13063
13064 case ISD::BITCAST:
13065 // TODO: This is incorrect as it loses track of the operand's type. We may
13066 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13067 // same bits that are canonicalized in one type need not be in the other.
13068 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13069 case ISD::TRUNCATE: {
13070 // Hack round the mess we make when legalizing extract_vector_elt
13071 if (Op.getValueType() == MVT::i16) {
13072 SDValue TruncSrc = Op.getOperand(0);
13073 if (TruncSrc.getValueType() == MVT::i32 &&
13074 TruncSrc.getOpcode() == ISD::BITCAST &&
13075 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13076 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13077 }
13078 }
13079 return false;
13080 }
13081 case ISD::INTRINSIC_WO_CHAIN: {
13082 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13083 // TODO: Handle more intrinsics
13084 switch (IntrinsicID) {
13085 case Intrinsic::amdgcn_cvt_pkrtz:
13086 case Intrinsic::amdgcn_cubeid:
13087 case Intrinsic::amdgcn_frexp_mant:
13088 case Intrinsic::amdgcn_fdot2:
13089 case Intrinsic::amdgcn_rcp:
13090 case Intrinsic::amdgcn_rsq:
13091 case Intrinsic::amdgcn_rsq_clamp:
13092 case Intrinsic::amdgcn_rcp_legacy:
13093 case Intrinsic::amdgcn_rsq_legacy:
13094 case Intrinsic::amdgcn_trig_preop:
13095 case Intrinsic::amdgcn_log:
13096 case Intrinsic::amdgcn_exp2:
13097 case Intrinsic::amdgcn_sqrt:
13098 return true;
13099 default:
13100 break;
13101 }
13102
13103 break;
13104 }
13105 default:
13106 break;
13107 }
13108
13109 // FIXME: denormalsEnabledForType is broken for dynamic
13110 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13111 DAG.isKnownNeverSNaN(Op);
13112}
13113
13114 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13115 unsigned MaxDepth) const {
13116 const MachineRegisterInfo &MRI = MF.getRegInfo();
13117 MachineInstr *MI = MRI.getVRegDef(Reg);
13118 unsigned Opcode = MI->getOpcode();
13119
13120 if (Opcode == AMDGPU::G_FCANONICALIZE)
13121 return true;
13122
13123 std::optional<FPValueAndVReg> FCR;
13124 // Constant splat (can be padded with undef) or scalar constant.
13125 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13126 if (FCR->Value.isSignaling())
13127 return false;
13128 if (!FCR->Value.isDenormal())
13129 return true;
13130
13131 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13132 return Mode == DenormalMode::getIEEE();
13133 }
13134
13135 if (MaxDepth == 0)
13136 return false;
13137
13138 switch (Opcode) {
13139 case AMDGPU::G_FADD:
13140 case AMDGPU::G_FSUB:
13141 case AMDGPU::G_FMUL:
13142 case AMDGPU::G_FCEIL:
13143 case AMDGPU::G_FFLOOR:
13144 case AMDGPU::G_FRINT:
13145 case AMDGPU::G_FNEARBYINT:
13146 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13147 case AMDGPU::G_INTRINSIC_TRUNC:
13148 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13149 case AMDGPU::G_FMA:
13150 case AMDGPU::G_FMAD:
13151 case AMDGPU::G_FSQRT:
13152 case AMDGPU::G_FDIV:
13153 case AMDGPU::G_FREM:
13154 case AMDGPU::G_FPOW:
13155 case AMDGPU::G_FPEXT:
13156 case AMDGPU::G_FLOG:
13157 case AMDGPU::G_FLOG2:
13158 case AMDGPU::G_FLOG10:
13159 case AMDGPU::G_FPTRUNC:
13160 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13161 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13162 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13163 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13164 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13165 return true;
13166 case AMDGPU::G_FNEG:
13167 case AMDGPU::G_FABS:
13168 case AMDGPU::G_FCOPYSIGN:
13169 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13170 case AMDGPU::G_FMINNUM:
13171 case AMDGPU::G_FMAXNUM:
13172 case AMDGPU::G_FMINNUM_IEEE:
13173 case AMDGPU::G_FMAXNUM_IEEE:
13174 case AMDGPU::G_FMINIMUM:
13175 case AMDGPU::G_FMAXIMUM: {
13176 if (Subtarget->supportsMinMaxDenormModes() ||
13177 // FIXME: denormalsEnabledForType is broken for dynamic
13178 denormalsEnabledForType(MRI.getType(Reg), MF))
13179 return true;
13180
13181 [[fallthrough]];
13182 }
13183 case AMDGPU::G_BUILD_VECTOR:
13184 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13185 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13186 return false;
13187 return true;
13188 case AMDGPU::G_INTRINSIC:
13189 case AMDGPU::G_INTRINSIC_CONVERGENT:
13190 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13191 case Intrinsic::amdgcn_fmul_legacy:
13192 case Intrinsic::amdgcn_fmad_ftz:
13193 case Intrinsic::amdgcn_sqrt:
13194 case Intrinsic::amdgcn_fmed3:
13195 case Intrinsic::amdgcn_sin:
13196 case Intrinsic::amdgcn_cos:
13197 case Intrinsic::amdgcn_log:
13198 case Intrinsic::amdgcn_exp2:
13199 case Intrinsic::amdgcn_log_clamp:
13200 case Intrinsic::amdgcn_rcp:
13201 case Intrinsic::amdgcn_rcp_legacy:
13202 case Intrinsic::amdgcn_rsq:
13203 case Intrinsic::amdgcn_rsq_clamp:
13204 case Intrinsic::amdgcn_rsq_legacy:
13205 case Intrinsic::amdgcn_div_scale:
13206 case Intrinsic::amdgcn_div_fmas:
13207 case Intrinsic::amdgcn_div_fixup:
13208 case Intrinsic::amdgcn_fract:
13209 case Intrinsic::amdgcn_cvt_pkrtz:
13210 case Intrinsic::amdgcn_cubeid:
13211 case Intrinsic::amdgcn_cubema:
13212 case Intrinsic::amdgcn_cubesc:
13213 case Intrinsic::amdgcn_cubetc:
13214 case Intrinsic::amdgcn_frexp_mant:
13215 case Intrinsic::amdgcn_fdot2:
13216 case Intrinsic::amdgcn_trig_preop:
13217 return true;
13218 default:
13219 break;
13220 }
13221
13222 [[fallthrough]];
13223 default:
13224 return false;
13225 }
13226
13227 llvm_unreachable("invalid operation");
13228}
13229
13230// Constant fold canonicalize.
13231SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13232 const SDLoc &SL, EVT VT,
13233 const APFloat &C) const {
13234 // Flush denormals to 0 if not enabled.
13235 if (C.isDenormal()) {
13236 DenormalMode Mode =
13237 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13238 if (Mode == DenormalMode::getPreserveSign()) {
13239 return DAG.getConstantFP(
13240 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13241 }
13242
13243 if (Mode != DenormalMode::getIEEE())
13244 return SDValue();
13245 }
13246
13247 if (C.isNaN()) {
13248 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13249 if (C.isSignaling()) {
13250 // Quiet a signaling NaN.
13251 // FIXME: Is this supposed to preserve payload bits?
13252 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13253 }
13254
13255 // Make sure it is the canonical NaN bitpattern.
13256 //
13257 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13258 // immediate?
13259 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13260 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13261 }
13262
13263 // Already canonical.
13264 return DAG.getConstantFP(C, SL, VT);
13265}
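// Not part of this file: a rough host-side analogy of the "flush denormals,
// preserve sign" constant folding above, assuming the host itself is not
// flushing denormals. The APFloat-based folding in the function is the real
// mechanism; this only illustrates the expected numeric behavior.
#include <cassert>
#include <cmath>
#include <limits>

static float flushDenormPreserveSign(float C) {
  if (std::fpclassify(C) == FP_SUBNORMAL)
    return std::copysign(0.0f, C); // keep the sign, drop the magnitude
  return C;
}

int main() {
  float NegDenorm = -std::numeric_limits<float>::denorm_min();
  assert(std::signbit(flushDenormPreserveSign(NegDenorm))); // sign preserved
  assert(flushDenormPreserveSign(NegDenorm) == 0.0f);       // value flushed
  assert(flushDenormPreserveSign(1.5f) == 1.5f);            // normals untouched
  return 0;
}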
13266
13268 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13269}
13270
13271SDValue
13272SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13273 DAGCombinerInfo &DCI) const {
13274 SelectionDAG &DAG = DCI.DAG;
13275 SDValue N0 = N->getOperand(0);
13276 EVT VT = N->getValueType(0);
13277
13278 // fcanonicalize undef -> qnan
13279 if (N0.isUndef()) {
13280 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13281 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13282 }
13283
13284 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13285 EVT VT = N->getValueType(0);
13286 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13287 }
13288
13289 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13290 // (fcanonicalize k)
13291 //
13292 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13293
13294 // TODO: This could be better with wider vectors that will be split to v2f16,
13295 // and to consider uses since there aren't that many packed operations.
13296 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13297 isTypeLegal(MVT::v2f16)) {
13298 SDLoc SL(N);
13299 SDValue NewElts[2];
13300 SDValue Lo = N0.getOperand(0);
13301 SDValue Hi = N0.getOperand(1);
13302 EVT EltVT = Lo.getValueType();
13303
13305 for (unsigned I = 0; I != 2; ++I) {
13306 SDValue Op = N0.getOperand(I);
13307 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13308 NewElts[I] =
13309 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13310 } else if (Op.isUndef()) {
13311 // Handled below based on what the other operand is.
13312 NewElts[I] = Op;
13313 } else {
13314 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13315 }
13316 }
13317
13318 // If one half is undef, and one is constant, prefer a splat vector rather
13319 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13320 // cheaper to use and may be free with a packed operation.
13321       if (NewElts[0].isUndef()) {
13323         NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13324                          ? NewElts[1]
13325                          : DAG.getConstantFP(0.0f, SL, EltVT);
13326       }
13327
13328 if (NewElts[1].isUndef()) {
13329 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13330 ? NewElts[0]
13331 : DAG.getConstantFP(0.0f, SL, EltVT);
13332 }
13333
13334 return DAG.getBuildVector(VT, SL, NewElts);
13335 }
13336 }
13337
13338 return SDValue();
13339}
13340
13341static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13342 switch (Opc) {
13343 case ISD::FMAXNUM:
13344 case ISD::FMAXNUM_IEEE:
13345 return AMDGPUISD::FMAX3;
13346 case ISD::FMAXIMUM:
13347 return AMDGPUISD::FMAXIMUM3;
13348 case ISD::SMAX:
13349 return AMDGPUISD::SMAX3;
13350 case ISD::UMAX:
13351 return AMDGPUISD::UMAX3;
13352 case ISD::FMINNUM:
13353 case ISD::FMINNUM_IEEE:
13354 return AMDGPUISD::FMIN3;
13355 case ISD::FMINIMUM:
13356 return AMDGPUISD::FMINIMUM3;
13357 case ISD::SMIN:
13358 return AMDGPUISD::SMIN3;
13359 case ISD::UMIN:
13360 return AMDGPUISD::UMIN3;
13361 default:
13362 llvm_unreachable("Not a min/max opcode");
13363 }
13364}
13365
13366SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13367 const SDLoc &SL, SDValue Src,
13368 SDValue MinVal,
13369 SDValue MaxVal,
13370 bool Signed) const {
13371
13372 // med3 comes from
13373 // min(max(x, K0), K1), K0 < K1
13374 // max(min(x, K0), K1), K1 < K0
13375 //
13376 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13377 // min/max op.
13378 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13379 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13380
13381 if (!MinK || !MaxK)
13382 return SDValue();
13383
13384 if (Signed) {
13385 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13386 return SDValue();
13387 } else {
13388 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13389 return SDValue();
13390 }
13391
13392 EVT VT = MinK->getValueType(0);
13393 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13394 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13395 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13396
13397 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13398 // not available, but this is unlikely to be profitable as constants
13399 // will often need to be materialized & extended, especially on
13400 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13401 return SDValue();
13402}
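// Worked example (values chosen for illustration only):
//   smin(smax(x, 2), 7) clamps x to the signed range [2, 7]; since
//   K0 = 2 < K1 = 7, the combine above produces SMED3(x, 2, 7), whose median
//   implements exactly that clamp in a single med3 instruction.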
13403
13405 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13406 return C;
13407
13408 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13409 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13410 return C;
13411 }
13412
13413 return nullptr;
13414}
13415
13416SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13417 const SDLoc &SL, SDValue Op0,
13418 SDValue Op1) const {
13420 if (!K1)
13421 return SDValue();
13422
13424 if (!K0)
13425 return SDValue();
13426
13427 // Ordered >= (although NaN inputs should have folded away by now).
13428 if (K0->getValueAPF() > K1->getValueAPF())
13429 return SDValue();
13430
13431 const MachineFunction &MF = DAG.getMachineFunction();
13433
13434 // TODO: Check IEEE bit enabled?
13435 EVT VT = Op0.getValueType();
13436 if (Info->getMode().DX10Clamp) {
13437 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13438 // hardware fmed3 behavior converting to a min.
13439 // FIXME: Should this be allowing -0.0?
13440 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13441 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13442 }
13443
13444 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13445 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13446 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13447 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13448 // then give the other result, which is different from med3 with a NaN
13449 // input.
13450 SDValue Var = Op0.getOperand(0);
13451 if (!DAG.isKnownNeverSNaN(Var))
13452 return SDValue();
13453
13455
13456 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13457 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13458 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13459 SDValue(K0, 0), SDValue(K1, 0));
13460 }
13461 }
13462
13463 return SDValue();
13464}
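// Worked examples (values chosen for illustration only): with dx10_clamp
// enabled, fminnum(fmaxnum(x, 0.0), 1.0) reaches this function with K0 = 0.0
// and K1 = 1.0 and becomes AMDGPUISD::CLAMP of the inner operand. Otherwise,
// e.g. K0 = 1.0 and K1 = 4.0 with x known not to be a signaling NaN becomes
// FMED3(x, 1.0, 4.0).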
13465
13466/// \return true if the subtarget supports minimum3 and maximum3 with the given
13467/// base min/max opcode \p Opc for type \p VT.
13468static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13469 EVT VT) {
13470 switch (Opc) {
13471 case ISD::FMINNUM:
13472 case ISD::FMAXNUM:
13473 case ISD::FMINNUM_IEEE:
13474 case ISD::FMAXNUM_IEEE:
13477 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13478 case ISD::FMINIMUM:
13479 case ISD::FMAXIMUM:
13480 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13481 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13482 case ISD::SMAX:
13483 case ISD::SMIN:
13484 case ISD::UMAX:
13485 case ISD::UMIN:
13486 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13487 default:
13488 return false;
13489 }
13490
13491 llvm_unreachable("not a min/max opcode");
13492}
13493
13494SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13495 DAGCombinerInfo &DCI) const {
13496 SelectionDAG &DAG = DCI.DAG;
13497
13498 EVT VT = N->getValueType(0);
13499 unsigned Opc = N->getOpcode();
13500 SDValue Op0 = N->getOperand(0);
13501 SDValue Op1 = N->getOperand(1);
13502
13503   // Only do this if the inner op has one use since this will just increase
13504   // register pressure for no benefit.
13505
13506 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13507 // max(max(a, b), c) -> max3(a, b, c)
13508 // min(min(a, b), c) -> min3(a, b, c)
13509 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13510 SDLoc DL(N);
13511 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13512 Op0.getOperand(0), Op0.getOperand(1), Op1);
13513 }
13514
13515 // Try commuted.
13516 // max(a, max(b, c)) -> max3(a, b, c)
13517 // min(a, min(b, c)) -> min3(a, b, c)
13518 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13519 SDLoc DL(N);
13520 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13521 Op0, Op1.getOperand(0), Op1.getOperand(1));
13522 }
13523 }
13524
13525 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13526 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13527 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13528 if (SDValue Med3 = performIntMed3ImmCombine(
13529 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13530 return Med3;
13531 }
13532 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13533 if (SDValue Med3 = performIntMed3ImmCombine(
13534 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13535 return Med3;
13536 }
13537
13538 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13539 if (SDValue Med3 = performIntMed3ImmCombine(
13540 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13541 return Med3;
13542 }
13543 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13544 if (SDValue Med3 = performIntMed3ImmCombine(
13545 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13546 return Med3;
13547 }
13548
13549 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13550 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13551 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13552 (Opc == AMDGPUISD::FMIN_LEGACY &&
13553 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13554 (VT == MVT::f32 || VT == MVT::f64 ||
13555 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13556 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13557 Op0.hasOneUse()) {
13558 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13559 return Res;
13560 }
13561
13562 return SDValue();
13563}
13564
13566 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13567 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13568 // FIXME: Should this be allowing -0.0?
13569 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13570 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13571 }
13572 }
13573
13574 return false;
13575}
13576
13577// FIXME: Should only worry about snans for version with chain.
13578SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13579 DAGCombinerInfo &DCI) const {
13580 EVT VT = N->getValueType(0);
13581 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13582 // NaNs. With a NaN input, the order of the operands may change the result.
13583
13584 SelectionDAG &DAG = DCI.DAG;
13585 SDLoc SL(N);
13586
13587 SDValue Src0 = N->getOperand(0);
13588 SDValue Src1 = N->getOperand(1);
13589 SDValue Src2 = N->getOperand(2);
13590
13591 if (isClampZeroToOne(Src0, Src1)) {
13592 // const_a, const_b, x -> clamp is safe in all cases including signaling
13593 // nans.
13594 // FIXME: Should this be allowing -0.0?
13595 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13596 }
13597
13598 const MachineFunction &MF = DAG.getMachineFunction();
13600
13601   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13602   // handling the case where dx10-clamp is disabled?
13603 if (Info->getMode().DX10Clamp) {
13604     // If NaNs are clamped to 0, we are free to reorder the inputs.
13605
13606 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13607 std::swap(Src0, Src1);
13608
13609 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13610 std::swap(Src1, Src2);
13611
13612 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13613 std::swap(Src0, Src1);
13614
13615 if (isClampZeroToOne(Src1, Src2))
13616 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13617 }
13618
13619 return SDValue();
13620}
13621
13622SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13623 DAGCombinerInfo &DCI) const {
13624 SDValue Src0 = N->getOperand(0);
13625 SDValue Src1 = N->getOperand(1);
13626 if (Src0.isUndef() && Src1.isUndef())
13627 return DCI.DAG.getUNDEF(N->getValueType(0));
13628 return SDValue();
13629}
13630
13631// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13632// expanded into a set of cmp/select instructions.
13634 unsigned NumElem,
13635 bool IsDivergentIdx,
13636 const GCNSubtarget *Subtarget) {
13638 return false;
13639
13640 unsigned VecSize = EltSize * NumElem;
13641
13642 // Sub-dword vectors of size 2 dword or less have better implementation.
13643 if (VecSize <= 64 && EltSize < 32)
13644 return false;
13645
13646   // Always expand the remaining sub-dword cases, otherwise they will be
13647   // lowered via memory.
13648 if (EltSize < 32)
13649 return true;
13650
13651 // Always do this if var-idx is divergent, otherwise it will become a loop.
13652 if (IsDivergentIdx)
13653 return true;
13654
13655 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13656 unsigned NumInsts = NumElem /* Number of compares */ +
13657 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13658
13659 // On some architectures (GFX9) movrel is not available and it's better
13660 // to expand.
13661 if (Subtarget->useVGPRIndexMode())
13662 return NumInsts <= 16;
13663
13664 // If movrel is available, use it instead of expanding for vector of 8
13665 // elements.
13666 if (Subtarget->hasMovrel())
13667 return NumInsts <= 15;
13668
13669 return true;
13670}
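// Worked instruction-count example (illustrative only): a dynamic index into
// 8 x i32 gives NumInsts = 8 compares + 8 cndmasks = 16. In VGPR index mode
// that fits the <= 16 budget, so the access is expanded; with movrel it
// misses the <= 15 budget, so the indirect-addressing path is used instead.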
13671
13673 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13674 if (isa<ConstantSDNode>(Idx))
13675 return false;
13676
13677 SDValue Vec = N->getOperand(0);
13678 EVT VecVT = Vec.getValueType();
13679 EVT EltVT = VecVT.getVectorElementType();
13680 unsigned EltSize = EltVT.getSizeInBits();
13681 unsigned NumElem = VecVT.getVectorNumElements();
13682
13684 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13685}
13686
13687SDValue
13688SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13689 DAGCombinerInfo &DCI) const {
13690 SDValue Vec = N->getOperand(0);
13691 SelectionDAG &DAG = DCI.DAG;
13692
13693 EVT VecVT = Vec.getValueType();
13694 EVT VecEltVT = VecVT.getVectorElementType();
13695 EVT ResVT = N->getValueType(0);
13696
13697 unsigned VecSize = VecVT.getSizeInBits();
13698 unsigned VecEltSize = VecEltVT.getSizeInBits();
13699
13700 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13702 SDLoc SL(N);
13703 SDValue Idx = N->getOperand(1);
13704 SDValue Elt =
13705 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13706 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13707 }
13708
13709 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13710 // =>
13711 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13712 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13713 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13714 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13715 SDLoc SL(N);
13716 SDValue Idx = N->getOperand(1);
13717 unsigned Opc = Vec.getOpcode();
13718
13719 switch (Opc) {
13720 default:
13721 break;
13722 // TODO: Support other binary operations.
13723 case ISD::FADD:
13724 case ISD::FSUB:
13725 case ISD::FMUL:
13726 case ISD::ADD:
13727 case ISD::UMIN:
13728 case ISD::UMAX:
13729 case ISD::SMIN:
13730 case ISD::SMAX:
13731 case ISD::FMAXNUM:
13732 case ISD::FMINNUM:
13733 case ISD::FMAXNUM_IEEE:
13734 case ISD::FMINNUM_IEEE:
13735 case ISD::FMAXIMUM:
13736 case ISD::FMINIMUM: {
13737 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13738 Vec.getOperand(0), Idx);
13739 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13740 Vec.getOperand(1), Idx);
13741
13742 DCI.AddToWorklist(Elt0.getNode());
13743 DCI.AddToWorklist(Elt1.getNode());
13744 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13745 }
13746 }
13747 }
13748
13749 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13751 SDLoc SL(N);
13752 SDValue Idx = N->getOperand(1);
13753 SDValue V;
13754 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13755 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13756 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13757 if (I == 0)
13758 V = Elt;
13759 else
13760 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13761 }
13762 return V;
13763 }
13764
13765 if (!DCI.isBeforeLegalize())
13766 return SDValue();
13767
13768 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13769 // elements. This exposes more load reduction opportunities by replacing
13770 // multiple small extract_vector_elements with a single 32-bit extract.
13771 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13772 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13773 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13774 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13775
13776 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13777 unsigned EltIdx = BitIndex / 32;
13778 unsigned LeftoverBitIdx = BitIndex % 32;
13779 SDLoc SL(N);
13780
13781 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13782 DCI.AddToWorklist(Cast.getNode());
13783
13784 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13785 DAG.getConstant(EltIdx, SL, MVT::i32));
13786 DCI.AddToWorklist(Elt.getNode());
13787 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13788 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13789 DCI.AddToWorklist(Srl.getNode());
13790
13791 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13792 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13793 DCI.AddToWorklist(Trunc.getNode());
13794
13795 if (VecEltVT == ResVT) {
13796 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13797 }
13798
13799 assert(ResVT.isScalarInteger());
13800 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13801 }
13802
13803 return SDValue();
13804}
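// Worked example of the sub-dword rewrite above (illustrative only):
// extracting element 5 of a loaded v8i8 gives BitIndex = 5 * 8 = 40, so
// EltIdx = 1 and LeftoverBitIdx = 8. The vector is bitcast to v2i32, dword 1
// is extracted, shifted right by 8, and truncated to i8, replacing a
// sub-dword access with a single 32-bit extract plus a shift.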
13805
13806SDValue
13807SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13808 DAGCombinerInfo &DCI) const {
13809 SDValue Vec = N->getOperand(0);
13810 SDValue Idx = N->getOperand(2);
13811 EVT VecVT = Vec.getValueType();
13812 EVT EltVT = VecVT.getVectorElementType();
13813
13814 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13815 // => BUILD_VECTOR n x select (e, const-idx)
13817 return SDValue();
13818
13819 SelectionDAG &DAG = DCI.DAG;
13820 SDLoc SL(N);
13821 SDValue Ins = N->getOperand(1);
13822 EVT IdxVT = Idx.getValueType();
13823
13825 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13826 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13827 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13828 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13829 Ops.push_back(V);
13830 }
13831
13832 return DAG.getBuildVector(VecVT, SL, Ops);
13833}
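// For example (illustrative only), inserting %val into a v4i32 %vec at a
// divergent index %idx becomes:
//   build_vector (select (idx == 0), val, vec[0]),
//                (select (idx == 1), val, vec[1]),
//                (select (idx == 2), val, vec[2]),
//                (select (idx == 3), val, vec[3])
// i.e. a short compare/select sequence instead of an indexing loop.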
13834
13835/// Return the source of an fp_extend from f16 to f32, or a converted FP
13836/// constant.
13838 if (Src.getOpcode() == ISD::FP_EXTEND &&
13839 Src.getOperand(0).getValueType() == MVT::f16) {
13840 return Src.getOperand(0);
13841 }
13842
13843 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13844 APFloat Val = CFP->getValueAPF();
13845 bool LosesInfo = true;
13847 if (!LosesInfo)
13848 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13849 }
13850
13851 return SDValue();
13852}
13853
13854SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13855 DAGCombinerInfo &DCI) const {
13856 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13857 "combine only useful on gfx8");
13858
13859 SDValue TruncSrc = N->getOperand(0);
13860 EVT VT = N->getValueType(0);
13861 if (VT != MVT::f16)
13862 return SDValue();
13863
13864 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13865 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13866 return SDValue();
13867
13868 SelectionDAG &DAG = DCI.DAG;
13869 SDLoc SL(N);
13870
13871 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13872 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13873 // casting back.
13874
13875 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13876 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13877 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13878 if (!A)
13879 return SDValue();
13880
13881 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13882 if (!B)
13883 return SDValue();
13884
13885 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13886 if (!C)
13887 return SDValue();
13888
13889 // This changes signaling nan behavior. If an input is a signaling nan, it
13890 // would have been quieted by the fpext originally. We don't care because
13891 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13892 // we would be worse off than just doing the promotion.
13893 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13894 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13895 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13896 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13897}
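// Worked example of the expansion above (values chosen for illustration
// only): for a = 1.0, b = 3.0, c = 2.0 the median is 2.0. The emitted
// sequence computes fmin(a, b) = 1.0, fmax(a, b) = 3.0,
// fmax(fmin(a, b), c) = 2.0 and finally fmin(3.0, 2.0) = 2.0, matching fmed3
// on the original f16 values without the round trip through f32.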
13898
13899unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13900 const SDNode *N0,
13901 const SDNode *N1) const {
13902 EVT VT = N0->getValueType(0);
13903
13904 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13905 // support denormals ever.
13906 if (((VT == MVT::f32 &&
13908 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13911 return ISD::FMAD;
13912
13913 const TargetOptions &Options = DAG.getTarget().Options;
13914 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13915 (N0->getFlags().hasAllowContract() &&
13916 N1->getFlags().hasAllowContract())) &&
13918 return ISD::FMA;
13919 }
13920
13921 return 0;
13922}
13923
13924// For a reassociatable opcode perform:
13925// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13926SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13927 SelectionDAG &DAG) const {
13928 EVT VT = N->getValueType(0);
13929 if (VT != MVT::i32 && VT != MVT::i64)
13930 return SDValue();
13931
13932 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13933 return SDValue();
13934
13935 unsigned Opc = N->getOpcode();
13936 SDValue Op0 = N->getOperand(0);
13937 SDValue Op1 = N->getOperand(1);
13938
13939 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13940 return SDValue();
13941
13942 if (Op0->isDivergent())
13943 std::swap(Op0, Op1);
13944
13945 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13946 return SDValue();
13947
13948 SDValue Op2 = Op1.getOperand(1);
13949 Op1 = Op1.getOperand(0);
13950 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13951 return SDValue();
13952
13953 if (Op1->isDivergent())
13954 std::swap(Op1, Op2);
13955
13956 SDLoc SL(N);
13957 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13958 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13959}
13960
13961static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13962 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13964 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13965 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13966 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13967}
13968
13969// Fold
13970// y = lshr i64 x, 32
13971// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
13972// with Const.hi == -1
13973// To
13974 //     res = mad_u64_u32 y.lo, Const.lo, x.lo
13976 SDValue MulLHS, SDValue MulRHS,
13977 SDValue AddRHS) {
13978 if (MulRHS.getOpcode() == ISD::SRL)
13979 std::swap(MulLHS, MulRHS);
13980
13981 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13982 return SDValue();
13983
13984 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13985 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13986 MulLHS.getOperand(0) != AddRHS)
13987 return SDValue();
13988
13989 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13990 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13991 return SDValue();
13992
13993 SDValue ConstMul =
13994 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
13995 return getMad64_32(DAG, SL, MVT::i64,
13996 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
13997 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
13998}
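// Why the fold above is legal (sketch): write x = x.hi * 2^32 + x.lo and
// Const = Const.lo - 2^32 (mod 2^64, since Const.hi == -1). Then
//   (x >> 32) * Const + x
//     = x.hi * Const.lo - x.hi * 2^32 + x.hi * 2^32 + x.lo
//     = x.hi * Const.lo + x.lo          (mod 2^64)
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)) as built above.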
13999
14000// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14001// multiplies, if any.
14002//
14003// Full 64-bit multiplies that feed into an addition are lowered here instead
14004// of using the generic expansion. The generic expansion ends up with
14005// a tree of ADD nodes that prevents us from using the "add" part of the
14006// MAD instruction. The expansion produced here results in a chain of ADDs
14007// instead of a tree.
14008SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14009 DAGCombinerInfo &DCI) const {
14010 assert(N->getOpcode() == ISD::ADD);
14011
14012 SelectionDAG &DAG = DCI.DAG;
14013 EVT VT = N->getValueType(0);
14014 SDLoc SL(N);
14015 SDValue LHS = N->getOperand(0);
14016 SDValue RHS = N->getOperand(1);
14017
14018 if (VT.isVector())
14019 return SDValue();
14020
14021 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14022 // result in scalar registers for uniform values.
14023 if (!N->isDivergent() && Subtarget->hasSMulHi())
14024 return SDValue();
14025
14026 unsigned NumBits = VT.getScalarSizeInBits();
14027 if (NumBits <= 32 || NumBits > 64)
14028 return SDValue();
14029
14030 if (LHS.getOpcode() != ISD::MUL) {
14031 assert(RHS.getOpcode() == ISD::MUL);
14032 std::swap(LHS, RHS);
14033 }
14034
14035 // Avoid the fold if it would unduly increase the number of multiplies due to
14036 // multiple uses, except on hardware with full-rate multiply-add (which is
14037 // part of full-rate 64-bit ops).
14038 if (!Subtarget->hasFullRate64Ops()) {
14039 unsigned NumUsers = 0;
14040 for (SDNode *User : LHS->users()) {
14041 // There is a use that does not feed into addition, so the multiply can't
14042 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14043 if (User->getOpcode() != ISD::ADD)
14044 return SDValue();
14045
14046 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14047 // MUL + 3xADD + 3xADDC over 3xMAD.
14048 ++NumUsers;
14049 if (NumUsers >= 3)
14050 return SDValue();
14051 }
14052 }
14053
14054 SDValue MulLHS = LHS.getOperand(0);
14055 SDValue MulRHS = LHS.getOperand(1);
14056 SDValue AddRHS = RHS;
14057
14058 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14059 return FoldedMAD;
14060
14061 // Always check whether operands are small unsigned values, since that
14062 // knowledge is useful in more cases. Check for small signed values only if
14063 // doing so can unlock a shorter code sequence.
14064 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14065 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14066
14067 bool MulSignedLo = false;
14068 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14069 MulSignedLo =
14070 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14071 }
14072
14073 // The operands and final result all have the same number of bits. If
14074 // operands need to be extended, they can be extended with garbage. The
14075 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14076 // truncated away in the end.
14077 if (VT != MVT::i64) {
14078 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14079 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14080 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14081 }
14082
14083 // The basic code generated is conceptually straightforward. Pseudo code:
14084 //
14085 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14086 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14087 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14088 //
14089 // The second and third lines are optional, depending on whether the factors
14090 // are {sign,zero}-extended or not.
14091 //
14092 // The actual DAG is noisier than the pseudo code, but only due to
14093 // instructions that disassemble values into low and high parts, and
14094 // assemble the final result.
14095 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14096
14097 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14098 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14099 SDValue Accum =
14100 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14101
14102 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14103 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14104
14105 if (!MulLHSUnsigned32) {
14106 auto MulLHSHi =
14107 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14108 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14109 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14110 }
14111
14112 if (!MulRHSUnsigned32) {
14113 auto MulRHSHi =
14114 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14115 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14116 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14117 }
14118
14119 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14120 Accum = DAG.getBitcast(MVT::i64, Accum);
14121 }
14122
14123 if (VT != MVT::i64)
14124 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14125 return Accum;
14126}
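// For example (illustrative only): for a divergent i64 add (mul a, b), c
// where both a and b are known to fit in 32 unsigned bits, only the first
// line of the pseudo code above is needed and the whole expression becomes a
// single mad_u64_u32; otherwise one or two extra 32-bit multiplies and adds
// patch up the high half as described.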
14127
14128SDValue
14129SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14130 DAGCombinerInfo &DCI) const {
14131 SDValue RHS = N->getOperand(1);
14132 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14133 if (!CRHS)
14134 return SDValue();
14135
14136 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14137 // common.
14138 uint64_t Val = CRHS->getZExtValue();
14139 if (countr_zero(Val) >= 32) {
14140 SelectionDAG &DAG = DCI.DAG;
14141 SDLoc SL(N);
14142 SDValue LHS = N->getOperand(0);
14143
14144 // Avoid carry machinery if we know the low half of the add does not
14145 // contribute to the final result.
14146 //
14147 // add i64:x, K if computeTrailingZeros(K) >= 32
14148 // => build_pair (add x.hi, K.hi), x.lo
14149
14150 // Breaking the 64-bit add here with this strange constant is unlikely
14151 // to interfere with addressing mode patterns.
14152
14153 SDValue Hi = getHiHalf64(LHS, DAG);
14154 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14155 SDValue AddHi =
14156 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14157
14158 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14159 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14160 }
14161
14162 return SDValue();
14163}
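// For example (illustrative only): add i64 %x, 0x500000000 has
// countr_zero(K) = 32, so it becomes build_pair (add i32 %x.hi, 5), %x.lo
// and the untouched low half needs no carry chain at all.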
14164
14165 // Collect the ultimate src of each of the mul node's operands, and confirm
14166 // each operand is no wider than 8 bits.
14167static std::optional<ByteProvider<SDValue>>
14168handleMulOperand(const SDValue &MulOperand) {
14169 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14170 if (!Byte0 || Byte0->isConstantZero()) {
14171 return std::nullopt;
14172 }
14173 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14174 if (Byte1 && !Byte1->isConstantZero()) {
14175 return std::nullopt;
14176 }
14177 return Byte0;
14178}
14179
14180static unsigned addPermMasks(unsigned First, unsigned Second) {
14181 unsigned FirstCs = First & 0x0c0c0c0c;
14182 unsigned SecondCs = Second & 0x0c0c0c0c;
14183 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14184 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14185
14186 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14187 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14188 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14189 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14190
14191 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14192}
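// Worked example (masks chosen for illustration only): a 0x0c selector byte
// writes the constant 0. Combining First = 0x0c0c0c01 (source byte 1 into
// output byte 0, zeros elsewhere) with Second = 0x0c0c070c (source byte 7
// into output byte 1, zeros elsewhere) keeps the non-0x0c selectors of each
// and the 0x0c bytes common to both, giving 0x0c0c0701.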
14193
14194struct DotSrc {
14196 int64_t PermMask;
14198};
14199
14203 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14204
14205 assert(Src0.Src.has_value() && Src1.Src.has_value());
14206 // Src0s and Src1s are empty, just place arbitrarily.
14207 if (Step == 0) {
14208 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14209 Src0.SrcOffset / 4});
14210 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14211 Src1.SrcOffset / 4});
14212 return;
14213 }
14214
14215 for (int BPI = 0; BPI < 2; BPI++) {
14216 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14217 if (BPI == 1) {
14218 BPP = {Src1, Src0};
14219 }
14220 unsigned ZeroMask = 0x0c0c0c0c;
14221 unsigned FMask = 0xFF << (8 * (3 - Step));
14222
14223 unsigned FirstMask =
14224 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14225 unsigned SecondMask =
14226 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14227     // Attempt to find a Src vector which contains our SDValue; if found, add
14228     // our perm mask to the existing one. If we are unable to find a match for
14229     // the first SDValue, attempt to find a match for the second.
14230 int FirstGroup = -1;
14231 for (int I = 0; I < 2; I++) {
14232 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14233 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14234 return IterElt.SrcOp == *BPP.first.Src &&
14235 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14236 };
14237
14238 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14239 if (Match != Srcs.end()) {
14240 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14241 FirstGroup = I;
14242 break;
14243 }
14244 }
14245 if (FirstGroup != -1) {
14246 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14247 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14248 return IterElt.SrcOp == *BPP.second.Src &&
14249 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14250 };
14251 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14252 if (Match != Srcs.end()) {
14253 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14254 } else
14255 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14256 return;
14257 }
14258 }
14259
14260 // If we have made it here, then we could not find a match in Src0s or Src1s
14261 // for either Src0 or Src1, so just place them arbitrarily.
14262
14263 unsigned ZeroMask = 0x0c0c0c0c;
14264 unsigned FMask = 0xFF << (8 * (3 - Step));
14265
14266 Src0s.push_back(
14267 {*Src0.Src,
14268 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14269 Src0.SrcOffset / 4});
14270 Src1s.push_back(
14271 {*Src1.Src,
14272 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14273 Src1.SrcOffset / 4});
14274}
14275
14277 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14278 bool IsAny) {
14279
14280 // If we just have one source, just permute it accordingly.
14281 if (Srcs.size() == 1) {
14282 auto *Elt = Srcs.begin();
14283 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14284
14285 // v_perm will produce the original value
14286 if (Elt->PermMask == 0x3020100)
14287 return EltOp;
14288
14289 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14290 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14291 }
14292
14293 auto *FirstElt = Srcs.begin();
14294 auto *SecondElt = std::next(FirstElt);
14295
14297
14298 // If we have multiple sources in the chain, combine them via perms (using
14299 // calculated perm mask) and Ors.
14300 while (true) {
14301 auto FirstMask = FirstElt->PermMask;
14302 auto SecondMask = SecondElt->PermMask;
14303
14304 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14305 unsigned FirstPlusFour = FirstMask | 0x04040404;
14306     // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14307     // original 0x0C.
14308 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14309
14310 auto PermMask = addPermMasks(FirstMask, SecondMask);
14311 auto FirstVal =
14312 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14313 auto SecondVal =
14314 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14315
14316 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14317 SecondVal,
14318 DAG.getConstant(PermMask, SL, MVT::i32)));
14319
14320 FirstElt = std::next(SecondElt);
14321 if (FirstElt == Srcs.end())
14322 break;
14323
14324 SecondElt = std::next(FirstElt);
14325 // If we only have a FirstElt, then just combine that into the cumulative
14326 // source node.
14327 if (SecondElt == Srcs.end()) {
14328 auto EltOp =
14329 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14330
14331 Perms.push_back(
14332 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14333 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14334 break;
14335 }
14336 }
14337
14338 assert(Perms.size() == 1 || Perms.size() == 2);
14339 return Perms.size() == 2
14340 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14341 : Perms[0];
14342}
14343
14344static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14345 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14346 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14347 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14348 EntryMask += ZeroMask;
14349 }
14350}
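// For example (mask chosen for illustration only): with ChainLength == 2, a
// mask built as 0x01000c0c is shifted down to 0x00000100 and then padded to
// 0x0c0c0100, so the two unused upper bytes select the constant 0 rather
// than leftover data.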
14351
14352static bool isMul(const SDValue Op) {
14353 auto Opcode = Op.getOpcode();
14354
14355 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14356 Opcode == AMDGPUISD::MUL_I24);
14357}
14358
14359static std::optional<bool>
14361 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14362 const SDValue &S1Op, const SelectionDAG &DAG) {
14363   // If both ops are i8s (pre legalize-dag), then the signedness semantics
14364   // of the dot4 are irrelevant.
14365 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14366 return false;
14367
14368 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14369 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14370 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14371 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14372 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14373 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14374
14375 assert(!(S0IsUnsigned && S0IsSigned));
14376 assert(!(S1IsUnsigned && S1IsSigned));
14377
14378 // There are 9 possible permutations of
14379 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14380
14381 // In two permutations, the sign bits are known to be the same for both Ops,
14382 // so simply return Signed / Unsigned corresponding to the MSB
14383
14384 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14385 return S0IsSigned;
14386
14387 // In another two permutations, the sign bits are known to be opposite. In
14388 // this case return std::nullopt to indicate a bad match.
14389
14390 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14391 return std::nullopt;
14392
14393 // In the remaining five permutations, we don't know the value of the sign
14394 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14395   // the upper bits must be extension bits. Thus, the only ways for the sign
14396   // bit to be unknown are if it was sign extended from an unknown value, or
14397   // if it was any extended. In either case, it is correct to use the signed
14398   // version of the signedness semantics of dot4.
14399
14400   // In two such permutations, we know the sign bit is set for
14401   // one op and the other is unknown. It is okay to use the signed version of
14402   // dot4.
14403 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14404 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14405 return true;
14406
14407   // In one such permutation, we don't know either of the sign bits. It is okay
14408   // to use the signed version of dot4.
14409 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14410 return true;
14411
14412   // In two such permutations, we know the sign bit is unset for
14413   // one op and the other is unknown. Return std::nullopt to indicate a
14414 // bad match.
14415 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14416 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14417 return std::nullopt;
14418
14419 llvm_unreachable("Fully covered condition");
14420}
14421
14422SDValue SITargetLowering::performAddCombine(SDNode *N,
14423 DAGCombinerInfo &DCI) const {
14424 SelectionDAG &DAG = DCI.DAG;
14425 EVT VT = N->getValueType(0);
14426 SDLoc SL(N);
14427 SDValue LHS = N->getOperand(0);
14428 SDValue RHS = N->getOperand(1);
14429
14430 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14431 if (Subtarget->hasMad64_32()) {
14432 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14433 return Folded;
14434 }
14435 }
14436
14437 if (SDValue V = reassociateScalarOps(N, DAG)) {
14438 return V;
14439 }
14440
14441 if (VT == MVT::i64) {
14442 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14443 return Folded;
14444 }
14445
14446 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14447 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14448 SDValue TempNode(N, 0);
14449 std::optional<bool> IsSigned;
14453
14454 // Match the v_dot4 tree, while collecting src nodes.
14455 int ChainLength = 0;
14456 for (int I = 0; I < 4; I++) {
14457 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14458 if (MulIdx == -1)
14459 break;
14460 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14461 if (!Src0)
14462 break;
14463 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14464 if (!Src1)
14465 break;
14466
14467 auto IterIsSigned = checkDot4MulSignedness(
14468 TempNode->getOperand(MulIdx), *Src0, *Src1,
14469 TempNode->getOperand(MulIdx)->getOperand(0),
14470 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14471 if (!IterIsSigned)
14472 break;
14473 if (!IsSigned)
14474 IsSigned = *IterIsSigned;
14475 if (*IterIsSigned != *IsSigned)
14476 break;
14477 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14478 auto AddIdx = 1 - MulIdx;
14479       // Allow the special case where add (add (mul24, 0), mul24) has been folded
14480       // into add (mul24, mul24).
14481 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14482 Src2s.push_back(TempNode->getOperand(AddIdx));
14483 auto Src0 =
14484 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14485 if (!Src0)
14486 break;
14487 auto Src1 =
14488 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14489 if (!Src1)
14490 break;
14491 auto IterIsSigned = checkDot4MulSignedness(
14492 TempNode->getOperand(AddIdx), *Src0, *Src1,
14493 TempNode->getOperand(AddIdx)->getOperand(0),
14494 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14495 if (!IterIsSigned)
14496 break;
14497 assert(IsSigned);
14498 if (*IterIsSigned != *IsSigned)
14499 break;
14500 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14501 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14502 ChainLength = I + 2;
14503 break;
14504 }
14505
14506 TempNode = TempNode->getOperand(AddIdx);
14507 Src2s.push_back(TempNode);
14508 ChainLength = I + 1;
14509 if (TempNode->getNumOperands() < 2)
14510 break;
14511 LHS = TempNode->getOperand(0);
14512 RHS = TempNode->getOperand(1);
14513 }
14514
14515 if (ChainLength < 2)
14516 return SDValue();
14517
14518     // Masks were constructed with the assumption that we would find a chain of
14519     // length 4. If not, then we need to zero out the unused high bytes (via the
14520     // 0x0c perm mask selector) so they do not affect the dot calculation.
14521 if (ChainLength < 4) {
14522 fixMasks(Src0s, ChainLength);
14523 fixMasks(Src1s, ChainLength);
14524 }
14525
14526 SDValue Src0, Src1;
14527
14528 // If we are just using a single source for both, and have permuted the
14529 // bytes consistently, we can just use the sources without permuting
14530 // (commutation).
14531 bool UseOriginalSrc = false;
14532 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14533 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14534 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14535 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14536 SmallVector<unsigned, 4> SrcBytes;
14537 auto Src0Mask = Src0s.begin()->PermMask;
14538 SrcBytes.push_back(Src0Mask & 0xFF000000);
14539 bool UniqueEntries = true;
14540 for (auto I = 1; I < 4; I++) {
14541 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14542
14543 if (is_contained(SrcBytes, NextByte)) {
14544 UniqueEntries = false;
14545 break;
14546 }
14547 SrcBytes.push_back(NextByte);
14548 }
14549
14550 if (UniqueEntries) {
14551 UseOriginalSrc = true;
14552
14553 auto *FirstElt = Src0s.begin();
14554 auto FirstEltOp =
14555 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14556
14557 auto *SecondElt = Src1s.begin();
14558 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14559 SecondElt->DWordOffset);
14560
14561 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14562 MVT::getIntegerVT(32));
14563 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14564 MVT::getIntegerVT(32));
14565 }
14566 }
14567
14568 if (!UseOriginalSrc) {
14569 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14570 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14571 }
14572
14573 assert(IsSigned);
14574 SDValue Src2 =
14575 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14576
14577 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14578 : Intrinsic::amdgcn_udot4,
14579 SL, MVT::i64);
14580
14581 assert(!VT.isVector());
14582 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14583 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14584
14585 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14586 }
14587
14588 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14589 return SDValue();
14590
14591 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14592 // add x, sext (setcc) => usubo_carry x, 0, setcc
14593 unsigned Opc = LHS.getOpcode();
14594 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14595 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14596 std::swap(RHS, LHS);
14597
14598 Opc = RHS.getOpcode();
14599 switch (Opc) {
14600 default:
14601 break;
14602 case ISD::ZERO_EXTEND:
14603 case ISD::SIGN_EXTEND:
14604 case ISD::ANY_EXTEND: {
14605 auto Cond = RHS.getOperand(0);
14606 // If this won't be a real VOPC output, we would still need to insert an
14607 // extra instruction anyway.
14608 if (!isBoolSGPR(Cond))
14609 break;
14610 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14611 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14613 return DAG.getNode(Opc, SL, VTList, Args);
14614 }
14615 case ISD::UADDO_CARRY: {
14616 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14617 if (!isNullConstant(RHS.getOperand(1)))
14618 break;
14619 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14620 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14621 }
14622 }
14623 return SDValue();
14624}
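// Shape of the chain matched by the dot4 path above (illustrative only):
//   add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), (mul a3, b3)))
// where each aN / bN is a single byte of the source operands. The bytes are
// gathered into two dwords with perms (see resolveSources) and the chain
// becomes one amdgcn_sdot4 / amdgcn_udot4, with the remaining non-multiply
// addend (or the constant 0 when all four products are matched) as the
// accumulator.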
14625
14626SDValue SITargetLowering::performSubCombine(SDNode *N,
14627 DAGCombinerInfo &DCI) const {
14628 SelectionDAG &DAG = DCI.DAG;
14629 EVT VT = N->getValueType(0);
14630
14631 if (VT == MVT::i64) {
14632 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14633 return Folded;
14634 }
14635
14636 if (VT != MVT::i32)
14637 return SDValue();
14638
14639 SDLoc SL(N);
14640 SDValue LHS = N->getOperand(0);
14641 SDValue RHS = N->getOperand(1);
14642
14643 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14644 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14645 unsigned Opc = RHS.getOpcode();
14646 switch (Opc) {
14647 default:
14648 break;
14649 case ISD::ZERO_EXTEND:
14650 case ISD::SIGN_EXTEND:
14651 case ISD::ANY_EXTEND: {
14652 auto Cond = RHS.getOperand(0);
14653 // If this won't be a real VOPC output, we would still need to insert an
14654 // extra instruction anyway.
14655 if (!isBoolSGPR(Cond))
14656 break;
14657 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14658 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14660 return DAG.getNode(Opc, SL, VTList, Args);
14661 }
14662 }
14663
14664 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14665 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14666 if (!isNullConstant(LHS.getOperand(1)))
14667 return SDValue();
14668 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14669 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14670 }
14671 return SDValue();
14672}
14673
14674SDValue
14675SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14676 DAGCombinerInfo &DCI) const {
14677
14678 if (N->getValueType(0) != MVT::i32)
14679 return SDValue();
14680
14681 if (!isNullConstant(N->getOperand(1)))
14682 return SDValue();
14683
14684 SelectionDAG &DAG = DCI.DAG;
14685 SDValue LHS = N->getOperand(0);
14686
14687 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14688 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14689 unsigned LHSOpc = LHS.getOpcode();
14690 unsigned Opc = N->getOpcode();
14691 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14692 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14693 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14694 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14695 }
14696 return SDValue();
14697}
14698
14699SDValue SITargetLowering::performFAddCombine(SDNode *N,
14700 DAGCombinerInfo &DCI) const {
14701 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14702 return SDValue();
14703
14704 SelectionDAG &DAG = DCI.DAG;
14705 EVT VT = N->getValueType(0);
14706
14707 SDLoc SL(N);
14708 SDValue LHS = N->getOperand(0);
14709 SDValue RHS = N->getOperand(1);
14710
14711 // These should really be instruction patterns, but writing patterns with
14712 // source modifiers is a pain.
14713
14714 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14715 if (LHS.getOpcode() == ISD::FADD) {
14716 SDValue A = LHS.getOperand(0);
14717 if (A == LHS.getOperand(1)) {
14718 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14719 if (FusedOp != 0) {
14720 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14721 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14722 }
14723 }
14724 }
14725
14726 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14727 if (RHS.getOpcode() == ISD::FADD) {
14728 SDValue A = RHS.getOperand(0);
14729 if (A == RHS.getOperand(1)) {
14730 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14731 if (FusedOp != 0) {
14732 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14733 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14734 }
14735 }
14736 }
14737
14738 return SDValue();
14739}
14740
14741SDValue SITargetLowering::performFSubCombine(SDNode *N,
14742 DAGCombinerInfo &DCI) const {
14743 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14744 return SDValue();
14745
14746 SelectionDAG &DAG = DCI.DAG;
14747 SDLoc SL(N);
14748 EVT VT = N->getValueType(0);
14749 assert(!VT.isVector());
14750
14751 // Try to get the fneg to fold into the source modifier. This undoes generic
14752 // DAG combines and folds them into the mad.
14753 //
14754 // Only do this if we are not trying to support denormals. v_mad_f32 does
14755 // not support denormals ever.
14756 SDValue LHS = N->getOperand(0);
14757 SDValue RHS = N->getOperand(1);
14758 if (LHS.getOpcode() == ISD::FADD) {
14759 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14760 SDValue A = LHS.getOperand(0);
14761 if (A == LHS.getOperand(1)) {
14762 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14763 if (FusedOp != 0) {
14764 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14765 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14766
14767 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14768 }
14769 }
14770 }
14771
14772 if (RHS.getOpcode() == ISD::FADD) {
14773 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14774
14775 SDValue A = RHS.getOperand(0);
14776 if (A == RHS.getOperand(1)) {
14777 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14778 if (FusedOp != 0) {
14779 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14780 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14781 }
14782 }
14783 }
14784
14785 return SDValue();
14786}
14787
14788SDValue SITargetLowering::performFDivCombine(SDNode *N,
14789 DAGCombinerInfo &DCI) const {
14790 SelectionDAG &DAG = DCI.DAG;
14791 SDLoc SL(N);
14792 EVT VT = N->getValueType(0);
14793 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14794 return SDValue();
14795
14796 SDValue LHS = N->getOperand(0);
14797 SDValue RHS = N->getOperand(1);
14798
14799 SDNodeFlags Flags = N->getFlags();
14800 SDNodeFlags RHSFlags = RHS->getFlags();
14801 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14802 !RHS->hasOneUse())
14803 return SDValue();
14804
14805 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14806 bool IsNegative = false;
14807 if (CLHS->isExactlyValue(1.0) ||
14808 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14809 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14810 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14811 if (RHS.getOpcode() == ISD::FSQRT) {
14812 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14813 SDValue Rsq =
14814 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14815 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14816 }
14817 }
14818 }
14819
14820 return SDValue();
14821}
14822
14823SDValue SITargetLowering::performFMulCombine(SDNode *N,
14824 DAGCombinerInfo &DCI) const {
14825 SelectionDAG &DAG = DCI.DAG;
14826 EVT VT = N->getValueType(0);
14827 EVT ScalarVT = VT.getScalarType();
14828 EVT IntVT = VT.changeElementType(MVT::i32);
14829
14830 SDValue LHS = N->getOperand(0);
14831 SDValue RHS = N->getOperand(1);
14832
14833   // It is cheaper to realize i32 inline constants than to materialize
14834   // f16 or f64 (or even non-inline f32) values; this is possible via
14835   // ldexp usage, as shown below:
14836 //
14837 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14838 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14839 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
14840 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14841 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14842 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14843 if (!TrueNode)
14844 return SDValue();
14845 const ConstantFPSDNode *FalseNode =
14846 isConstOrConstSplatFP(RHS.getOperand(2));
14847 if (!FalseNode)
14848 return SDValue();
14849
14850 if (TrueNode->isNegative() != FalseNode->isNegative())
14851 return SDValue();
14852
14853 // For f32, only non-inline constants should be transformed.
14855 if (ScalarVT == MVT::f32 &&
14856 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14857 TII->isInlineConstant(FalseNode->getValueAPF()))
14858 return SDValue();
14859
14860 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14861 if (TrueNodeExpVal == INT_MIN)
14862 return SDValue();
14863 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14864 if (FalseNodeExpVal == INT_MIN)
14865 return SDValue();
14866
14867 SDLoc SL(N);
14868 SDValue SelectNode =
14869 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14870 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14871 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14872
14873 LHS = TrueNode->isNegative()
14874 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14875 : LHS;
14876
14877 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14878 }
14879
14880 return SDValue();
14881}
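// For example (constants chosen for illustration only): with A = 8.0 = 2^3
// and B = 32.0 = 2^5,
//   fmul x, (select y, 8.0, 32.0)  ->  ldexp(x, (select i32 y, 3, 5))
// and with both constants negated the same fold applies to (fneg x).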
14882
14883SDValue SITargetLowering::performFMACombine(SDNode *N,
14884 DAGCombinerInfo &DCI) const {
14885 SelectionDAG &DAG = DCI.DAG;
14886 EVT VT = N->getValueType(0);
14887 SDLoc SL(N);
14888
14889 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14890 return SDValue();
14891
14892   // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14893   //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14894 SDValue Op1 = N->getOperand(0);
14895 SDValue Op2 = N->getOperand(1);
14896 SDValue FMA = N->getOperand(2);
14897
14898 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14899 Op2.getOpcode() != ISD::FP_EXTEND)
14900 return SDValue();
14901
14902   // fdot2_f32_f16 always flushes fp32 denormal operands and outputs to zero,
14903   // regardless of the denorm mode setting. Therefore,
14904 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14905 const TargetOptions &Options = DAG.getTarget().Options;
14906 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14907 (N->getFlags().hasAllowContract() &&
14908 FMA->getFlags().hasAllowContract())) {
14909 Op1 = Op1.getOperand(0);
14910 Op2 = Op2.getOperand(0);
14911 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14913 return SDValue();
14914
14915 SDValue Vec1 = Op1.getOperand(0);
14916 SDValue Idx1 = Op1.getOperand(1);
14917 SDValue Vec2 = Op2.getOperand(0);
14918
14919 SDValue FMAOp1 = FMA.getOperand(0);
14920 SDValue FMAOp2 = FMA.getOperand(1);
14921 SDValue FMAAcc = FMA.getOperand(2);
14922
14923 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14924 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14925 return SDValue();
14926
14927 FMAOp1 = FMAOp1.getOperand(0);
14928 FMAOp2 = FMAOp2.getOperand(0);
14929 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14930 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14931 return SDValue();
14932
14933 SDValue Vec3 = FMAOp1.getOperand(0);
14934 SDValue Vec4 = FMAOp2.getOperand(0);
14935 SDValue Idx2 = FMAOp1.getOperand(1);
14936
14937 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14938 // Idx1 and Idx2 cannot be the same.
14939 Idx1 == Idx2)
14940 return SDValue();
14941
14942 if (Vec1 == Vec2 || Vec3 == Vec4)
14943 return SDValue();
14944
14945 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14946 return SDValue();
14947
14948 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14949 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14950 DAG.getTargetConstant(0, SL, MVT::i1));
14951 }
14952 }
14953 return SDValue();
14954}
14955
14956SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14957 DAGCombinerInfo &DCI) const {
14958 SelectionDAG &DAG = DCI.DAG;
14959 SDLoc SL(N);
14960
14961 SDValue LHS = N->getOperand(0);
14962 SDValue RHS = N->getOperand(1);
14963 EVT VT = LHS.getValueType();
14964 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14965
14966 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14967 if (!CRHS) {
14968 CRHS = dyn_cast<ConstantSDNode>(LHS);
14969 if (CRHS) {
14970 std::swap(LHS, RHS);
14971 CC = getSetCCSwappedOperands(CC);
14972 }
14973 }
14974
14975 if (CRHS) {
14976 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14977 isBoolSGPR(LHS.getOperand(0))) {
14978 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14979 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14980 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14981 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14982 if ((CRHS->isAllOnes() &&
14983 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14984 (CRHS->isZero() &&
14985 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14986 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14987 DAG.getAllOnesConstant(SL, MVT::i1));
14988 if ((CRHS->isAllOnes() &&
14989 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14990 (CRHS->isZero() &&
14991 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14992 return LHS.getOperand(0);
14993 }
14994
14995 const APInt &CRHSVal = CRHS->getAPIntValue();
14996 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14997 LHS.getOpcode() == ISD::SELECT &&
14998 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14999 isa<ConstantSDNode>(LHS.getOperand(2)) &&
15000 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15001 isBoolSGPR(LHS.getOperand(0))) {
15002 // Given CT != FT:
15003 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15004 // setcc (select cc, CT, CF), CF, ne => cc
15005 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15006 // setcc (select cc, CT, CF), CT, eq => cc
15007 const APInt &CT = LHS.getConstantOperandAPInt(1);
15008 const APInt &CF = LHS.getConstantOperandAPInt(2);
15009
15010 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15011 (CT == CRHSVal && CC == ISD::SETNE))
15012 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15013 DAG.getAllOnesConstant(SL, MVT::i1));
15014 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15015 (CT == CRHSVal && CC == ISD::SETEQ))
15016 return LHS.getOperand(0);
15017 }
15018 }
15019
15020 if (VT != MVT::f32 && VT != MVT::f64 &&
15021 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15022 return SDValue();
15023
15024 // Match isinf/isfinite pattern
15025 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15026 // (fcmp one (fabs x), inf) -> (fp_class x,
15027 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15028 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15029 LHS.getOpcode() == ISD::FABS) {
15030 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15031 if (!CRHS)
15032 return SDValue();
15033
15034 const APFloat &APF = CRHS->getValueAPF();
15035 if (APF.isInfinity() && !APF.isNegative()) {
15036 const unsigned IsInfMask =
15037 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15038 const unsigned IsFiniteMask =
15039 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15040 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15041 SIInstrFlags::P_SUBNORMAL;
15042 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15043 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15044 DAG.getConstant(Mask, SL, MVT::i32));
15045 }
15046 }
15047
15048 return SDValue();
15049}
15050
15051SDValue
15052SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15053 DAGCombinerInfo &DCI) const {
15054 SelectionDAG &DAG = DCI.DAG;
15055 SDLoc SL(N);
15056 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15057
15058 SDValue Src = N->getOperand(0);
15059 SDValue Shift = N->getOperand(0);
15060
15061 // TODO: Extend type shouldn't matter (assuming legal types).
15062 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15063 Shift = Shift.getOperand(0);
15064
15065 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15066 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15067 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15068 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15069 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15070 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
15071 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15072 SDValue Shifted = DAG.getZExtOrTrunc(
15073 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15074
15075 unsigned ShiftOffset = 8 * Offset;
15076 if (Shift.getOpcode() == ISD::SHL)
15077 ShiftOffset -= C->getZExtValue();
15078 else
15079 ShiftOffset += C->getZExtValue();
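 // Worked example: for cvt_f32_ubyte1 (shl x, 8), Offset is 1 and the shift
 // removes 8 bits, so ShiftOffset becomes 0 and the node is rewritten below
 // as cvt_f32_ubyte0 of the unshifted value.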
15080
15081 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15082 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15083 MVT::f32, Shifted);
15084 }
15085 }
15086 }
15087
15088 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15089 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15090 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15091 // We simplified Src. If this node is not dead, visit it again so it is
15092 // folded properly.
15093 if (N->getOpcode() != ISD::DELETED_NODE)
15094 DCI.AddToWorklist(N);
15095 return SDValue(N, 0);
15096 }
15097
15098 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15099 if (SDValue DemandedSrc =
15100 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15101 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15102
15103 return SDValue();
15104}
15105
15106SDValue SITargetLowering::performClampCombine(SDNode *N,
15107 DAGCombinerInfo &DCI) const {
15108 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15109 if (!CSrc)
15110 return SDValue();
15111
15112 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15113 const APFloat &F = CSrc->getValueAPF();
15114 APFloat Zero = APFloat::getZero(F.getSemantics());
15115 if (F < Zero ||
15116 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15117 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15118 }
15119
15120 APFloat One(F.getSemantics(), "1.0");
15121 if (F > One)
15122 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15123
15124 return SDValue(CSrc, 0);
15125}
15126
15127 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15128 DAGCombinerInfo &DCI) const {
15129 switch (N->getOpcode()) {
15130 case ISD::ADD:
15131 case ISD::SUB:
15132 case ISD::SHL:
15133 case ISD::SRL:
15134 case ISD::SRA:
15135 case ISD::AND:
15136 case ISD::OR:
15137 case ISD::XOR:
15138 case ISD::MUL:
15139 case ISD::SETCC:
15140 case ISD::SELECT:
15141 case ISD::SMIN:
15142 case ISD::SMAX:
15143 case ISD::UMIN:
15144 case ISD::UMAX:
15145 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15146 return Res;
15147 break;
15148 default:
15149 break;
15150 }
15151
15152 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15153 return SDValue();
15154
15155 switch (N->getOpcode()) {
15156 case ISD::ADD:
15157 return performAddCombine(N, DCI);
15158 case ISD::SUB:
15159 return performSubCombine(N, DCI);
15160 case ISD::UADDO_CARRY:
15161 case ISD::USUBO_CARRY:
15162 return performAddCarrySubCarryCombine(N, DCI);
15163 case ISD::FADD:
15164 return performFAddCombine(N, DCI);
15165 case ISD::FSUB:
15166 return performFSubCombine(N, DCI);
15167 case ISD::FDIV:
15168 return performFDivCombine(N, DCI);
15169 case ISD::FMUL:
15170 return performFMulCombine(N, DCI);
15171 case ISD::SETCC:
15172 return performSetCCCombine(N, DCI);
15173 case ISD::FMAXNUM:
15174 case ISD::FMINNUM:
15175 case ISD::FMAXNUM_IEEE:
15176 case ISD::FMINNUM_IEEE:
15177 case ISD::FMAXIMUM:
15178 case ISD::FMINIMUM:
15179 case ISD::SMAX:
15180 case ISD::SMIN:
15181 case ISD::UMAX:
15182 case ISD::UMIN:
15183 case AMDGPUISD::FMIN_LEGACY:
15184 case AMDGPUISD::FMAX_LEGACY:
15185 return performMinMaxCombine(N, DCI);
15186 case ISD::FMA:
15187 return performFMACombine(N, DCI);
15188 case ISD::AND:
15189 return performAndCombine(N, DCI);
15190 case ISD::OR:
15191 return performOrCombine(N, DCI);
15192 case ISD::FSHR: {
15193 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15194 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15195 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15196 return matchPERM(N, DCI);
15197 }
15198 break;
15199 }
15200 case ISD::XOR:
15201 return performXorCombine(N, DCI);
15202 case ISD::ZERO_EXTEND:
15203 return performZeroExtendCombine(N, DCI);
15204 case ISD::SIGN_EXTEND_INREG:
15205 return performSignExtendInRegCombine(N, DCI);
15206 case AMDGPUISD::FP_CLASS:
15207 return performClassCombine(N, DCI);
15208 case ISD::FCANONICALIZE:
15209 return performFCanonicalizeCombine(N, DCI);
15210 case AMDGPUISD::RCP:
15211 return performRcpCombine(N, DCI);
15212 case ISD::FLDEXP:
15213 case AMDGPUISD::FRACT:
15214 case AMDGPUISD::RSQ:
15215 case AMDGPUISD::RCP_LEGACY:
15216 case AMDGPUISD::RCP_IFLAG:
15217 case AMDGPUISD::RSQ_CLAMP: {
15218 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15219 SDValue Src = N->getOperand(0);
15220 if (Src.isUndef())
15221 return Src;
15222 break;
15223 }
15224 case ISD::SINT_TO_FP:
15225 case ISD::UINT_TO_FP:
15226 return performUCharToFloatCombine(N, DCI);
15227 case ISD::FCOPYSIGN:
15228 return performFCopySignCombine(N, DCI);
15229 case AMDGPUISD::CVT_F32_UBYTE0:
15230 case AMDGPUISD::CVT_F32_UBYTE1:
15231 case AMDGPUISD::CVT_F32_UBYTE2:
15232 case AMDGPUISD::CVT_F32_UBYTE3:
15233 return performCvtF32UByteNCombine(N, DCI);
15234 case AMDGPUISD::FMED3:
15235 return performFMed3Combine(N, DCI);
15236 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15237 return performCvtPkRTZCombine(N, DCI);
15238 case AMDGPUISD::CLAMP:
15239 return performClampCombine(N, DCI);
15240 case ISD::SCALAR_TO_VECTOR: {
15241 SelectionDAG &DAG = DCI.DAG;
15242 EVT VT = N->getValueType(0);
15243
15244 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15245 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15246 SDLoc SL(N);
15247 SDValue Src = N->getOperand(0);
15248 EVT EltVT = Src.getValueType();
15249 if (EltVT != MVT::i16)
15250 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15251
15252 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15253 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15254 }
15255
15256 break;
15257 }
15258 case ISD::EXTRACT_VECTOR_ELT:
15259 return performExtractVectorEltCombine(N, DCI);
15260 case ISD::INSERT_VECTOR_ELT:
15261 return performInsertVectorEltCombine(N, DCI);
15262 case ISD::FP_ROUND:
15263 return performFPRoundCombine(N, DCI);
15264 case ISD::LOAD: {
15265 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15266 return Widened;
15267 [[fallthrough]];
15268 }
15269 default: {
15270 if (!DCI.isBeforeLegalize()) {
15271 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15272 return performMemSDNodeCombine(MemNode, DCI);
15273 }
15274
15275 break;
15276 }
15277 }
15278
15279 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15280}
15281
15282/// Helper function for adjustWritemask
15283static unsigned SubIdx2Lane(unsigned Idx) {
15284 switch (Idx) {
15285 default:
15286 return ~0u;
15287 case AMDGPU::sub0:
15288 return 0;
15289 case AMDGPU::sub1:
15290 return 1;
15291 case AMDGPU::sub2:
15292 return 2;
15293 case AMDGPU::sub3:
15294 return 3;
15295 case AMDGPU::sub4:
15296 return 4; // Possible with TFE/LWE
15297 }
15298}
15299
15300/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15301SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15302 SelectionDAG &DAG) const {
15303 unsigned Opcode = Node->getMachineOpcode();
15304
15305 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15306 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15307 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15308 return Node; // not implemented for D16
15309
15310 SDNode *Users[5] = {nullptr};
15311 unsigned Lane = 0;
15312 unsigned DmaskIdx =
15313 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15314 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15315 unsigned NewDmask = 0;
15316 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15317 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15318 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15319 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15320 ? true
15321 : false;
15322 unsigned TFCLane = 0;
15323 bool HasChain = Node->getNumValues() > 1;
15324
15325 if (OldDmask == 0) {
15326 // These are folded out, but on the chance it happens don't assert.
15327 return Node;
15328 }
15329
15330 unsigned OldBitsSet = llvm::popcount(OldDmask);
15331 // Work out which is the TFE/LWE lane if that is enabled.
15332 if (UsesTFC) {
15333 TFCLane = OldBitsSet;
15334 }
15335
15336 // Try to figure out the used register components
15337 for (SDUse &Use : Node->uses()) {
15338
15339 // Don't look at users of the chain.
15340 if (Use.getResNo() != 0)
15341 continue;
15342
15343 SDNode *User = Use.getUser();
15344
15345 // Abort if we can't understand the usage
15346 if (!User->isMachineOpcode() ||
15347 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15348 return Node;
15349
15350 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15351 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15352 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15353 // set, etc.
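 // For example, with OldDmask == 0b1010 only the Y and W components are
 // enabled, so Lane == 0 corresponds to Y and Lane == 1 corresponds to W.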
15354 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15355 if (Lane == ~0u)
15356 return Node;
15357
15358 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15359 if (UsesTFC && Lane == TFCLane) {
15360 Users[Lane] = User;
15361 } else {
15362 // Set which texture component corresponds to the lane.
15363 unsigned Comp;
15364 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15365 Comp = llvm::countr_zero(Dmask);
15366 Dmask &= ~(1 << Comp);
15367 }
15368
15369 // Abort if we have more than one user per component.
15370 if (Users[Lane])
15371 return Node;
15372
15373 Users[Lane] = User;
15374 NewDmask |= 1 << Comp;
15375 }
15376 }
15377
15378 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15379 bool NoChannels = !NewDmask;
15380 if (NoChannels) {
15381 if (!UsesTFC) {
15382 // No uses of the result and not using TFC. Then do nothing.
15383 return Node;
15384 }
15385 // If the original dmask has one channel - then nothing to do
15386 if (OldBitsSet == 1)
15387 return Node;
15388 // Use an arbitrary dmask - required for the instruction to work
15389 NewDmask = 1;
15390 }
15391 // Abort if there's no change
15392 if (NewDmask == OldDmask)
15393 return Node;
15394
15395 unsigned BitsSet = llvm::popcount(NewDmask);
15396
15397 // Check for TFE or LWE - increase the number of channels by one to account
15398 // for the extra return value.
15399 // This will need adjustment for D16 if this is also included in
15400 // adjustWriteMask (this function), but at present D16 is excluded.
15401 unsigned NewChannels = BitsSet + UsesTFC;
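 // For example, a dmask with two bits set plus TFE gives NewChannels == 3,
 // which is rounded up to a 4-element result type below.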
15402
15403 int NewOpcode =
15404 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15405 assert(NewOpcode != -1 &&
15406 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15407 "failed to find equivalent MIMG op");
15408
15409 // Adjust the writemask in the node
15410 SmallVector<SDValue, 12> Ops;
15411 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15412 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15413 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15414
15415 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15416
15417 MVT ResultVT = NewChannels == 1
15418 ? SVT
15419 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15420 : NewChannels == 5 ? 8
15421 : NewChannels);
15422 SDVTList NewVTList =
15423 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15424
15425 MachineSDNode *NewNode =
15426 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15427
15428 if (HasChain) {
15429 // Update chain.
15430 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15431 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15432 }
15433
15434 if (NewChannels == 1) {
15435 assert(Node->hasNUsesOfValue(1, 0));
15436 SDNode *Copy =
15437 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15438 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15439 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15440 return nullptr;
15441 }
15442
15443 // Update the users of the node with the new indices
15444 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15445 SDNode *User = Users[i];
15446 if (!User) {
15447 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15448 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15449 if (i || !NoChannels)
15450 continue;
15451 } else {
15452 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15453 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15454 if (NewUser != User) {
15455 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15456 DAG.RemoveDeadNode(User);
15457 }
15458 }
15459
15460 switch (Idx) {
15461 default:
15462 break;
15463 case AMDGPU::sub0:
15464 Idx = AMDGPU::sub1;
15465 break;
15466 case AMDGPU::sub1:
15467 Idx = AMDGPU::sub2;
15468 break;
15469 case AMDGPU::sub2:
15470 Idx = AMDGPU::sub3;
15471 break;
15472 case AMDGPU::sub3:
15473 Idx = AMDGPU::sub4;
15474 break;
15475 }
15476 }
15477
15478 DAG.RemoveDeadNode(Node);
15479 return nullptr;
15480}
15481
15482 static bool isFrameIndexOp(SDValue Op) {
15483 if (Op.getOpcode() == ISD::AssertZext)
15484 Op = Op.getOperand(0);
15485
15486 return isa<FrameIndexSDNode>(Op);
15487}
15488
15489/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15490/// with frame index operands.
15491 /// LLVM assumes that inputs to these instructions are registers.
15492SDNode *
15493 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15494 SelectionDAG &DAG) const {
15495 if (Node->getOpcode() == ISD::CopyToReg) {
15496 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15497 SDValue SrcVal = Node->getOperand(2);
15498
15499 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15500 // to try understanding copies to physical registers.
15501 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15502 SDLoc SL(Node);
15503 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15504 SDValue VReg = DAG.getRegister(
15505 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15506
15507 SDNode *Glued = Node->getGluedNode();
15508 SDValue ToVReg = DAG.getCopyToReg(
15509 Node->getOperand(0), SL, VReg, SrcVal,
15510 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15511 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15512 VReg, ToVReg.getValue(1));
15513 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15514 DAG.RemoveDeadNode(Node);
15515 return ToResultReg.getNode();
15516 }
15517 }
15518
15519 SmallVector<SDValue, 8> Ops;
15520 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15521 if (!isFrameIndexOp(Node->getOperand(i))) {
15522 Ops.push_back(Node->getOperand(i));
15523 continue;
15524 }
15525
15526 SDLoc DL(Node);
15527 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15528 Node->getOperand(i).getValueType(),
15529 Node->getOperand(i)),
15530 0));
15531 }
15532
15533 return DAG.UpdateNodeOperands(Node, Ops);
15534}
15535
15536/// Fold the instructions after selecting them.
15537/// Returns null if users were already updated.
15538 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15539 SelectionDAG &DAG) const {
15540 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15541 unsigned Opcode = Node->getMachineOpcode();
15542
15543 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15544 !TII->isGather4(Opcode) &&
15545 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15546 return adjustWritemask(Node, DAG);
15547 }
15548
15549 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15550 legalizeTargetIndependentNode(Node, DAG);
15551 return Node;
15552 }
15553
15554 switch (Opcode) {
15555 case AMDGPU::V_DIV_SCALE_F32_e64:
15556 case AMDGPU::V_DIV_SCALE_F64_e64: {
15557 // Satisfy the operand register constraint when one of the inputs is
15558 // undefined. Ordinarily each undef value will have its own implicit_def of
15559 // a vreg, so force these to use a single register.
15560 SDValue Src0 = Node->getOperand(1);
15561 SDValue Src1 = Node->getOperand(3);
15562 SDValue Src2 = Node->getOperand(5);
15563
15564 if ((Src0.isMachineOpcode() &&
15565 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15566 (Src0 == Src1 || Src0 == Src2))
15567 break;
15568
15569 MVT VT = Src0.getValueType().getSimpleVT();
15570 const TargetRegisterClass *RC =
15571 getRegClassFor(VT, Src0.getNode()->isDivergent());
15572
15573 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15574 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15575
15576 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15577 Src0, SDValue());
15578
15579 // src0 must be the same register as src1 or src2, even if the value is
15580 // undefined, so make sure we don't violate this constraint.
15581 if (Src0.isMachineOpcode() &&
15582 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15583 if (Src1.isMachineOpcode() &&
15584 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15585 Src0 = Src1;
15586 else if (Src2.isMachineOpcode() &&
15587 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15588 Src0 = Src2;
15589 else {
15590 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15591 Src0 = UndefReg;
15592 Src1 = UndefReg;
15593 }
15594 } else
15595 break;
15596
15597 SmallVector<SDValue, 9> Ops(Node->ops());
15598 Ops[1] = Src0;
15599 Ops[3] = Src1;
15600 Ops[5] = Src2;
15601 Ops.push_back(ImpDef.getValue(1));
15602 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15603 }
15604 default:
15605 break;
15606 }
15607
15608 return Node;
15609}
15610
15611// Any MIMG instructions that use tfe or lwe require an initialization of the
15612// result register that will be written in the case of a memory access failure.
15613 // Code is also added to tie this initialization to the result of the
15614 // image instruction.
15615 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15616 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15617 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15618 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15619 MachineBasicBlock &MBB = *MI.getParent();
15620
15621 int DstIdx =
15622 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15623 unsigned InitIdx = 0;
15624
15625 if (TII->isImage(MI)) {
15626 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15627 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15628 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15629
15630 if (!TFE && !LWE) // intersect_ray
15631 return;
15632
15633 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15634 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15635 unsigned D16Val = D16 ? D16->getImm() : 0;
15636
15637 if (!TFEVal && !LWEVal)
15638 return;
15639
15640 // At least one of TFE or LWE is non-zero.
15641 // We have to insert a suitable initialization of the result value and
15642 // tie this to the dest of the image instruction.
15643
15644 // Calculate which dword we have to initialize to 0.
15645 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15646
15647 // check that dmask operand is found.
15648 assert(MO_Dmask && "Expected dmask operand in instruction");
15649
15650 unsigned dmask = MO_Dmask->getImm();
15651 // Determine the number of active lanes taking into account the
15652 // Gather4 special case
15653 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15654
15655 bool Packed = !Subtarget->hasUnpackedD16VMem();
15656
15657 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15658
15659 // Abandon the attempt if the dst size isn't large enough
15660 // - this is in fact an error, but it is picked up elsewhere and
15661 // reported correctly.
15662 uint32_t DstSize =
15663 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15664 if (DstSize < InitIdx)
15665 return;
15666 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15667 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15668 } else {
15669 return;
15670 }
15671
15672 const DebugLoc &DL = MI.getDebugLoc();
15673
15674 // Create a register for the initialization value.
15675 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15676 unsigned NewDst = 0; // Final initialized value will be in here
15677
15678 // If PRTStrictNull feature is enabled (the default) then initialize
15679 // all the result registers to 0, otherwise just the error indication
15680 // register (VGPRn+1)
15681 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15682 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
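 // For example, with InitIdx == 4 and PRTStrictNull enabled, dwords 0..3 are
 // zero-initialized below; without PRTStrictNull only dword 3 (the TFE/LWE
 // error indication dword) is initialized.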
15683
15684 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15685 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15686 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15687 // Initialize dword
15688 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15689 // clang-format off
15690 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15691 .addImm(0);
15692 // clang-format on
15693 // Insert into the super-reg
15694 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15695 .addReg(PrevDst)
15696 .addReg(SubReg)
15697 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15698
15699 PrevDst = NewDst;
15700 }
15701
15702 // Add as an implicit operand
15703 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15704
15705 // Tie the just added implicit operand to the dst
15706 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15707}
15708
15709/// Assign the register class depending on the number of
15710/// bits set in the writemask
15711 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15712 SDNode *Node) const {
15713 const SIInstrInfo *TII = Subtarget->getInstrInfo();
15714
15715 MachineFunction *MF = MI.getParent()->getParent();
15716 MachineRegisterInfo &MRI = MF->getRegInfo();
15717 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15718
15719 if (TII->isVOP3(MI.getOpcode())) {
15720 // Make sure constant bus requirements are respected.
15721 TII->legalizeOperandsVOP3(MRI, MI);
15722
15723 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15724 // This saves a chain-copy of registers and better balances register
15725 // use between vgpr and agpr as agpr tuples tend to be big.
15726 if (!MI.getDesc().operands().empty()) {
15727 unsigned Opc = MI.getOpcode();
15728 bool HasAGPRs = Info->mayNeedAGPRs();
15729 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15730 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15731 for (auto I :
15732 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15733 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15734 if (I == -1)
15735 break;
15736 if ((I == Src2Idx) && (HasAGPRs))
15737 break;
15738 MachineOperand &Op = MI.getOperand(I);
15739 if (!Op.isReg() || !Op.getReg().isVirtual())
15740 continue;
15741 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15742 if (!TRI->hasAGPRs(RC))
15743 continue;
15744 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15745 if (!Src || !Src->isCopy() ||
15746 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15747 continue;
15748 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15749 // All uses of agpr64 and agpr32 can also accept vgpr except for
15750 // v_accvgpr_read, but we do not produce agpr reads during selection,
15751 // so no use checks are needed.
15752 MRI.setRegClass(Op.getReg(), NewRC);
15753 }
15754
15755 if (TII->isMAI(MI)) {
15756 // The ordinary src0, src1, src2 were legalized above.
15757 //
15758 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15759 // as a separate instruction.
15760 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15761 AMDGPU::OpName::scale_src0);
15762 if (Src0Idx != -1) {
15763 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15764 AMDGPU::OpName::scale_src1);
15765 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15766 TII->usesConstantBus(MRI, MI, Src1Idx))
15767 TII->legalizeOpWithMove(MI, Src1Idx);
15768 }
15769 }
15770
15771 if (!HasAGPRs)
15772 return;
15773
15774 // Resolve the rest of AV operands to AGPRs.
15775 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15776 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15777 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15778 if (TRI->isVectorSuperClass(RC)) {
15779 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15780 MRI.setRegClass(Src2->getReg(), NewRC);
15781 if (Src2->isTied())
15782 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15783 }
15784 }
15785 }
15786 }
15787
15788 return;
15789 }
15790
15791 if (TII->isImage(MI))
15792 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15793}
15794
15795 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15796 uint64_t Val) {
15797 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15798 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15799}
15800
15801 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15802 const SDLoc &DL,
15803 SDValue Ptr) const {
15804 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15805
15806 // Build the half of the subregister with the constants before building the
15807 // full 128-bit register. If we are building multiple resource descriptors,
15808 // this will allow CSEing of the 2-component register.
15809 const SDValue Ops0[] = {
15810 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15811 buildSMovImm32(DAG, DL, 0),
15812 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15813 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15814 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15815
15816 SDValue SubRegHi = SDValue(
15817 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15818
15819 // Combine the constants and the pointer.
15820 const SDValue Ops1[] = {
15821 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15822 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15823 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15824
15825 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15826}
15827
15828/// Return a resource descriptor with the 'Add TID' bit enabled
15829/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15830/// of the resource descriptor) to create an offset, which is added to
15831/// the resource pointer.
15832 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15833 SDValue Ptr, uint32_t RsrcDword1,
15834 uint64_t RsrcDword2And3) const {
15835 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15836 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15837 if (RsrcDword1) {
15838 PtrHi =
15839 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15840 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15841 0);
15842 }
15843
15844 SDValue DataLo =
15845 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15846 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15847
15848 const SDValue Ops[] = {
15849 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15850 PtrLo,
15851 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15852 PtrHi,
15853 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15854 DataLo,
15855 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15856 DataHi,
15857 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15858
15859 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15860}
15861
15862//===----------------------------------------------------------------------===//
15863// SI Inline Assembly Support
15864//===----------------------------------------------------------------------===//
15865
15866std::pair<unsigned, const TargetRegisterClass *>
15867 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15868 StringRef Constraint,
15869 MVT VT) const {
15870 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15871
15872 const TargetRegisterClass *RC = nullptr;
15873 if (Constraint.size() == 1) {
15874 const unsigned BitWidth = VT.getSizeInBits();
15875 switch (Constraint[0]) {
15876 default:
15877 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15878 case 's':
15879 case 'r':
15880 switch (BitWidth) {
15881 case 16:
15882 RC = &AMDGPU::SReg_32RegClass;
15883 break;
15884 case 64:
15885 RC = &AMDGPU::SGPR_64RegClass;
15886 break;
15887 default:
15888 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15889 if (!RC)
15890 return std::pair(0U, nullptr);
15891 break;
15892 }
15893 break;
15894 case 'v':
15895 switch (BitWidth) {
15896 case 16:
15897 RC = &AMDGPU::VGPR_32RegClass;
15898 break;
15899 default:
15900 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15901 if (!RC)
15902 return std::pair(0U, nullptr);
15903 break;
15904 }
15905 break;
15906 case 'a':
15907 if (!Subtarget->hasMAIInsts())
15908 break;
15909 switch (BitWidth) {
15910 case 16:
15911 RC = &AMDGPU::AGPR_32RegClass;
15912 break;
15913 default:
15914 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15915 if (!RC)
15916 return std::pair(0U, nullptr);
15917 break;
15918 }
15919 break;
15920 }
15921 // We actually support i128, i16 and f16 as inline parameters
15922 // even if they are not reported as legal
15923 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15924 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15925 return std::pair(0U, RC);
15926 }
15927
15928 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15929 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15930 if (RegName.consume_front("v")) {
15931 RC = &AMDGPU::VGPR_32RegClass;
15932 } else if (RegName.consume_front("s")) {
15933 RC = &AMDGPU::SGPR_32RegClass;
15934 } else if (RegName.consume_front("a")) {
15935 RC = &AMDGPU::AGPR_32RegClass;
15936 }
15937
15938 if (RC) {
15939 uint32_t Idx;
15940 if (RegName.consume_front("[")) {
15941 uint32_t End;
15942 bool Failed = RegName.consumeInteger(10, Idx);
15943 Failed |= !RegName.consume_front(":");
15944 Failed |= RegName.consumeInteger(10, End);
15945 Failed |= !RegName.consume_back("]");
15946 if (!Failed) {
15947 uint32_t Width = (End - Idx + 1) * 32;
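 // For example, a constraint such as "{v[8:9]}" gives Idx == 8, End == 9
 // and Width == 64, so (for a 64-bit operand) it resolves to the v8_v9
 // register pair in the 64-bit VGPR class.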
15948 // Prohibit constraints for register ranges with a width that does not
15949 // match the required type.
15950 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15951 return std::pair(0U, nullptr);
15952 MCRegister Reg = RC->getRegister(Idx);
15953 if (SIRegisterInfo::isVGPRClass(RC))
15954 RC = TRI->getVGPRClassForBitWidth(Width);
15955 else if (SIRegisterInfo::isSGPRClass(RC))
15956 RC = TRI->getSGPRClassForBitWidth(Width);
15957 else if (SIRegisterInfo::isAGPRClass(RC))
15958 RC = TRI->getAGPRClassForBitWidth(Width);
15959 if (RC) {
15960 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15961 if (!Reg) {
15962 // The register class does not contain the requested register,
15963 // e.g., because it is an SGPR pair that would violate alignment
15964 // requirements.
15965 return std::pair(0U, nullptr);
15966 }
15967 return std::pair(Reg, RC);
15968 }
15969 }
15970 } else {
15971 // Check for lossy scalar/vector conversions.
15972 if (VT.isVector() && VT.getSizeInBits() != 32)
15973 return std::pair(0U, nullptr);
15974 bool Failed = RegName.getAsInteger(10, Idx);
15975 if (!Failed && Idx < RC->getNumRegs())
15976 return std::pair(RC->getRegister(Idx), RC);
15977 }
15978 }
15979 }
15980
15981 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15982 if (Ret.first)
15983 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15984
15985 return Ret;
15986}
15987
15988static bool isImmConstraint(StringRef Constraint) {
15989 if (Constraint.size() == 1) {
15990 switch (Constraint[0]) {
15991 default:
15992 break;
15993 case 'I':
15994 case 'J':
15995 case 'A':
15996 case 'B':
15997 case 'C':
15998 return true;
15999 }
16000 } else if (Constraint == "DA" || Constraint == "DB") {
16001 return true;
16002 }
16003 return false;
16004}
16005
16006 SITargetLowering::ConstraintType
16007 SITargetLowering::getConstraintType(StringRef Constraint) const {
16008 if (Constraint.size() == 1) {
16009 switch (Constraint[0]) {
16010 default:
16011 break;
16012 case 's':
16013 case 'v':
16014 case 'a':
16015 return C_RegisterClass;
16016 }
16017 }
16018 if (isImmConstraint(Constraint)) {
16019 return C_Other;
16020 }
16021 return TargetLowering::getConstraintType(Constraint);
16022}
16023
16024static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16025 if (Size != 64) {
16026 Val = Val & maskTrailingOnes<uint64_t>(Size);
16027 }
16028 return Val;
16029}
16030
16031 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16032 StringRef Constraint,
16033 std::vector<SDValue> &Ops,
16034 SelectionDAG &DAG) const {
16035 if (isImmConstraint(Constraint)) {
16036 uint64_t Val;
16037 if (getAsmOperandConstVal(Op, Val) &&
16038 checkAsmConstraintVal(Op, Constraint, Val)) {
16039 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
16040 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
16041 }
16042 } else {
16043 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16044 }
16045}
16046
16047 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16048 unsigned Size = Op.getScalarValueSizeInBits();
16049 if (Size > 64)
16050 return false;
16051
16052 if (Size == 16 && !Subtarget->has16BitInsts())
16053 return false;
16054
16055 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16056 Val = C->getSExtValue();
16057 return true;
16058 }
16059 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
16060 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16061 return true;
16062 }
16063 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
16064 if (Size != 16 || Op.getNumOperands() != 2)
16065 return false;
16066 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
16067 return false;
16068 if (ConstantSDNode *C = V->getConstantSplatNode()) {
16069 Val = C->getSExtValue();
16070 return true;
16071 }
16072 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16073 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16074 return true;
16075 }
16076 }
16077
16078 return false;
16079}
16080
16081 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16082 uint64_t Val) const {
16083 if (Constraint.size() == 1) {
16084 switch (Constraint[0]) {
16085 case 'I':
16086 return AMDGPU::isInlinableIntLiteral(Val);
16087 case 'J':
16088 return isInt<16>(Val);
16089 case 'A':
16090 return checkAsmConstraintValA(Op, Val);
16091 case 'B':
16092 return isInt<32>(Val);
16093 case 'C':
16094 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16095 AMDGPU::isInlinableLiteral32(Val, Subtarget->hasInv2PiInlineImm());
16096 default:
16097 break;
16098 }
16099 } else if (Constraint.size() == 2) {
16100 if (Constraint == "DA") {
16101 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16102 int64_t LoBits = static_cast<int32_t>(Val);
16103 return checkAsmConstraintValA(Op, HiBits, 32) &&
16104 checkAsmConstraintValA(Op, LoBits, 32);
16105 }
16106 if (Constraint == "DB") {
16107 return true;
16108 }
16109 }
16110 llvm_unreachable("Invalid asm constraint");
16111}
16112
16113 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16114 unsigned MaxSize) const {
16115 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16116 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16117 if (Size == 16) {
16118 MVT VT = Op.getSimpleValueType();
16119 switch (VT.SimpleTy) {
16120 default:
16121 return false;
16122 case MVT::i16:
16123 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16124 case MVT::f16:
16125 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16126 case MVT::bf16:
16127 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16128 case MVT::v2i16:
16129 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16130 case MVT::v2f16:
16131 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16132 case MVT::v2bf16:
16133 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16134 }
16135 }
16136 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16137 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16138 return true;
16139 return false;
16140}
16141
16142static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16143 switch (UnalignedClassID) {
16144 case AMDGPU::VReg_64RegClassID:
16145 return AMDGPU::VReg_64_Align2RegClassID;
16146 case AMDGPU::VReg_96RegClassID:
16147 return AMDGPU::VReg_96_Align2RegClassID;
16148 case AMDGPU::VReg_128RegClassID:
16149 return AMDGPU::VReg_128_Align2RegClassID;
16150 case AMDGPU::VReg_160RegClassID:
16151 return AMDGPU::VReg_160_Align2RegClassID;
16152 case AMDGPU::VReg_192RegClassID:
16153 return AMDGPU::VReg_192_Align2RegClassID;
16154 case AMDGPU::VReg_224RegClassID:
16155 return AMDGPU::VReg_224_Align2RegClassID;
16156 case AMDGPU::VReg_256RegClassID:
16157 return AMDGPU::VReg_256_Align2RegClassID;
16158 case AMDGPU::VReg_288RegClassID:
16159 return AMDGPU::VReg_288_Align2RegClassID;
16160 case AMDGPU::VReg_320RegClassID:
16161 return AMDGPU::VReg_320_Align2RegClassID;
16162 case AMDGPU::VReg_352RegClassID:
16163 return AMDGPU::VReg_352_Align2RegClassID;
16164 case AMDGPU::VReg_384RegClassID:
16165 return AMDGPU::VReg_384_Align2RegClassID;
16166 case AMDGPU::VReg_512RegClassID:
16167 return AMDGPU::VReg_512_Align2RegClassID;
16168 case AMDGPU::VReg_1024RegClassID:
16169 return AMDGPU::VReg_1024_Align2RegClassID;
16170 case AMDGPU::AReg_64RegClassID:
16171 return AMDGPU::AReg_64_Align2RegClassID;
16172 case AMDGPU::AReg_96RegClassID:
16173 return AMDGPU::AReg_96_Align2RegClassID;
16174 case AMDGPU::AReg_128RegClassID:
16175 return AMDGPU::AReg_128_Align2RegClassID;
16176 case AMDGPU::AReg_160RegClassID:
16177 return AMDGPU::AReg_160_Align2RegClassID;
16178 case AMDGPU::AReg_192RegClassID:
16179 return AMDGPU::AReg_192_Align2RegClassID;
16180 case AMDGPU::AReg_256RegClassID:
16181 return AMDGPU::AReg_256_Align2RegClassID;
16182 case AMDGPU::AReg_512RegClassID:
16183 return AMDGPU::AReg_512_Align2RegClassID;
16184 case AMDGPU::AReg_1024RegClassID:
16185 return AMDGPU::AReg_1024_Align2RegClassID;
16186 default:
16187 return -1;
16188 }
16189}
16190
16191// Figure out which registers should be reserved for stack access. Only after
16192// the function is legalized do we know all of the non-spill stack objects or if
16193// calls are present.
16194 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16195 MachineRegisterInfo &MRI = MF.getRegInfo();
16196 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16197 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16198 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16199 const SIInstrInfo *TII = ST.getInstrInfo();
16200
16201 if (Info->isEntryFunction()) {
16202 // Callable functions have fixed registers used for stack access.
16203 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16204 }
16205
16206 // TODO: Move this logic to getReservedRegs()
16207 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16208 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16209 Register SReg = ST.isWave32()
16210 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16211 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16212 &AMDGPU::SGPR_64RegClass);
16213 Info->setSGPRForEXECCopy(SReg);
16214
16215 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16216 Info->getStackPtrOffsetReg()));
16217 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16218 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16219
16220 // We need to worry about replacing the default register with itself in case
16221 // of MIR testcases missing the MFI.
16222 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16223 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16224
16225 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16226 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16227
16228 Info->limitOccupancy(MF);
16229
16230 if (ST.isWave32() && !MF.empty()) {
16231 for (auto &MBB : MF) {
16232 for (auto &MI : MBB) {
16233 TII->fixImplicitOperands(MI);
16234 }
16235 }
16236 }
16237
16238 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16239 // classes if required. Ideally the register class constraints would differ
16240 // per-subtarget, but there's no easy way to achieve that right now. This is
16241 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16242 // from using them as the register class for legal types.
16243 if (ST.needsAlignedVGPRs()) {
16244 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16245 const Register Reg = Register::index2VirtReg(I);
16246 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16247 if (!RC)
16248 continue;
16249 int NewClassID = getAlignedAGPRClassID(RC->getID());
16250 if (NewClassID != -1)
16251 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16252 }
16253 }
16254
16255 TargetLoweringBase::finalizeLowering(MF);
16256}
16257
16258 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16259 KnownBits &Known,
16260 const APInt &DemandedElts,
16261 const SelectionDAG &DAG,
16262 unsigned Depth) const {
16263 Known.resetAll();
16264 unsigned Opc = Op.getOpcode();
16265 switch (Opc) {
16266 case ISD::INTRINSIC_WO_CHAIN: {
16267 unsigned IID = Op.getConstantOperandVal(0);
16268 switch (IID) {
16269 case Intrinsic::amdgcn_mbcnt_lo:
16270 case Intrinsic::amdgcn_mbcnt_hi: {
16271 const GCNSubtarget &ST =
16272 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16273 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16274 // most 31 + src1.
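 // For example, on wave32 the lane count is at most 31, so bits above bit 4
 // are known zero before the known bits of src1 are added in.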
16275 Known.Zero.setBitsFrom(
16276 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16277 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16278 Known = KnownBits::add(Known, Known2);
16279 return;
16280 }
16281 }
16282 break;
16283 }
16284 }
16285 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16286 Op, Known, DemandedElts, DAG, Depth);
16287}
16288
16289 void SITargetLowering::computeKnownBitsForFrameIndex(
16290 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16291 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16292
16293 // Set the high bits to zero based on the maximum allowed scratch size per
16294 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16295 // calculation won't overflow, so assume the sign bit is never set.
16296 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16297}
16298
16299 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16300 KnownBits &Known, unsigned Dim) {
16301 unsigned MaxValue =
16302 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16303 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16304}
16305
16306 void SITargetLowering::computeKnownBitsForTargetInstr(
16307 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16308 const MachineRegisterInfo &MRI, unsigned Depth) const {
16309 const MachineInstr *MI = MRI.getVRegDef(R);
16310 switch (MI->getOpcode()) {
16311 case AMDGPU::G_INTRINSIC:
16312 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16313 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16314 switch (IID) {
16315 case Intrinsic::amdgcn_workitem_id_x:
16316 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16317 break;
16318 case Intrinsic::amdgcn_workitem_id_y:
16319 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16320 break;
16321 case Intrinsic::amdgcn_workitem_id_z:
16322 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16323 break;
16324 case Intrinsic::amdgcn_mbcnt_lo:
16325 case Intrinsic::amdgcn_mbcnt_hi: {
16326 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16327 // most 31 + src1.
16328 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16329 ? getSubtarget()->getWavefrontSizeLog2()
16330 : 5);
16331 KnownBits Known2;
16332 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16333 Depth + 1);
16334 Known = KnownBits::add(Known, Known2);
16335 break;
16336 }
16337 case Intrinsic::amdgcn_groupstaticsize: {
16338 // We can report everything over the maximum size as 0. We can't report
16339 // based on the actual size because we don't know if it's accurate or not
16340 // at any given point.
16341 Known.Zero.setHighBits(
16342 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16343 break;
16344 }
16345 }
16346 break;
16347 }
16348 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16349 Known.Zero.setHighBits(24);
16350 break;
16351 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16352 Known.Zero.setHighBits(16);
16353 break;
16354 case AMDGPU::G_AMDGPU_SMED3:
16355 case AMDGPU::G_AMDGPU_UMED3: {
16356 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16357
16358 KnownBits Known2;
16359 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16360 if (Known2.isUnknown())
16361 break;
16362
16363 KnownBits Known1;
16364 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16365 if (Known1.isUnknown())
16366 break;
16367
16368 KnownBits Known0;
16369 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16370 if (Known0.isUnknown())
16371 break;
16372
16373 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16374 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16375 Known.One = Known0.One & Known1.One & Known2.One;
16376 break;
16377 }
16378 }
16379}
16380
16381 Align SITargetLowering::computeKnownAlignForTargetInstr(
16382 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16383 unsigned Depth) const {
16384 const MachineInstr *MI = MRI.getVRegDef(R);
16385 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16386 // FIXME: Can this move to generic code? What about the case where the call
16387 // site specifies a lower alignment?
16388 Intrinsic::ID IID = GI->getIntrinsicID();
16389 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16390 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16391 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16392 return *RetAlign;
16393 }
16394 return Align(1);
16395}
16396
16397 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16398 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16399 const Align CacheLineAlign = Align(64);
16400
16401 // Pre-GFX10 target did not benefit from loop alignment
16402 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16403 getSubtarget()->hasInstFwdPrefetchBug())
16404 return PrefAlign;
16405
16406 // On GFX10 I$ is 4 x 64 bytes cache lines.
16407 // By default prefetcher keeps one cache line behind and reads two ahead.
16408 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16409 // behind and one ahead.
16410 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
16411 // If the loop fits in 64 bytes it always spans no more than two cache lines and
16412 // does not need an alignment.
16413 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
16414 // if it is at most 192 bytes we need two lines behind.
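 // For example, a 100-byte loop aligned to a 64-byte boundary spans at most
 // two cache lines, so the default prefetch settings already cover it and
 // only the CacheLineAlign alignment below is applied.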
16415
16416 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16417 const MachineBasicBlock *Header = ML->getHeader();
16418 if (Header->getAlignment() != PrefAlign)
16419 return Header->getAlignment(); // Already processed.
16420
16421 unsigned LoopSize = 0;
16422 for (const MachineBasicBlock *MBB : ML->blocks()) {
16423 // If an inner loop block is aligned, assume on average half of the alignment
16424 // size to be added as nops.
16425 if (MBB != Header)
16426 LoopSize += MBB->getAlignment().value() / 2;
16427
16428 for (const MachineInstr &MI : *MBB) {
16429 LoopSize += TII->getInstSizeInBytes(MI);
16430 if (LoopSize > 192)
16431 return PrefAlign;
16432 }
16433 }
16434
16435 if (LoopSize <= 64)
16436 return PrefAlign;
16437
16438 if (LoopSize <= 128)
16439 return CacheLineAlign;
16440
16441 // If any of the parent loops is surrounded by prefetch instructions, do not
16442 // insert new ones for the inner loop, which would reset the parent's settings.
16443 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16444 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16445 auto I = Exit->getFirstNonDebugInstr();
16446 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16447 return CacheLineAlign;
16448 }
16449 }
16450
16451 MachineBasicBlock *Pre = ML->getLoopPreheader();
16452 MachineBasicBlock *Exit = ML->getExitBlock();
16453
16454 if (Pre && Exit) {
16455 auto PreTerm = Pre->getFirstTerminator();
16456 if (PreTerm == Pre->begin() ||
16457 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16458 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16459 .addImm(1); // prefetch 2 lines behind PC
16460
16461 auto ExitHead = Exit->getFirstNonDebugInstr();
16462 if (ExitHead == Exit->end() ||
16463 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16464 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16465 .addImm(2); // prefetch 1 line behind PC
16466 }
16467
16468 return CacheLineAlign;
16469}
16470
16472static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16473 assert(N->getOpcode() == ISD::CopyFromReg);
16474 do {
16475 // Follow the chain until we find an INLINEASM node.
16476 N = N->getOperand(0).getNode();
16477 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16478 return true;
16479 } while (N->getOpcode() == ISD::CopyFromReg);
16480 return false;
16481}
16482
16483 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16484 FunctionLoweringInfo *FLI,
16485 UniformityInfo *UA) const {
16486 switch (N->getOpcode()) {
16487 case ISD::CopyFromReg: {
16488 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16489 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16490 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16491 Register Reg = R->getReg();
16492
16493 // FIXME: Why does this need to consider isLiveIn?
16494 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16495 return !TRI->isSGPRReg(MRI, Reg);
16496
16497 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16498 return UA->isDivergent(V);
16499
16500 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16501 return !TRI->isSGPRReg(MRI, Reg);
16502 }
16503 case ISD::LOAD: {
16504 const LoadSDNode *L = cast<LoadSDNode>(N);
16505 unsigned AS = L->getAddressSpace();
16506 // A flat load may access private memory.
16507 return L->isDivergent() || !AMDGPU::isUniformMMO(L->getMemOperand());
16508 }
16509 case ISD::CALLSEQ_END:
16510 return true;
16511 case ISD::INTRINSIC_WO_CHAIN:
16512 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16513 case ISD::INTRINSIC_W_CHAIN:
16514 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16533 // Target-specific read-modify-write atomics are sources of divergence.
16534 return true;
16535 default:
16536 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16537 // Generic read-modify-write atomics are sources of divergence.
16538 return A->readMem() && A->writeMem();
16539 }
16540 return false;
16541 }
16542}
16543
16544 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16545 EVT VT) const {
16546 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16547 case MVT::f32:
16548 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16549 case MVT::f64:
16550 case MVT::f16:
16551 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16552 default:
16553 return false;
16554 }
16555}
16556
16557 bool SITargetLowering::denormalsEnabledForType(
16558 LLT Ty, const MachineFunction &MF) const {
16559 switch (Ty.getScalarSizeInBits()) {
16560 case 32:
16561 return !denormalModeIsFlushAllF32(MF);
16562 case 64:
16563 case 16:
16564 return !denormalModeIsFlushAllF64F16(MF);
16565 default:
16566 return false;
16567 }
16568}
16569
16570 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16571 const SelectionDAG &DAG,
16572 bool SNaN,
16573 unsigned Depth) const {
16574 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16575 const MachineFunction &MF = DAG.getMachineFunction();
16576 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16577
16578 if (Info->getMode().DX10Clamp)
16579 return true; // Clamped to 0.
16580 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16581 }
16582
16583 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16584 Depth);
16585}
16586
16587// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16588// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16589 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16590 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16591 return true;
16592
16593 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16594 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16595 if (DenormMode == DenormalMode::getPreserveSign())
16596 return true;
16597
16598 // TODO: Remove this.
16599 return RMW->getFunction()
16600 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16601 .getValueAsBool();
16602}
16603
16604 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16605 LLVMContext &Ctx = RMW->getContext();
16606 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16607 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16608
16609 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16610 << "Hardware instruction generated for atomic "
16611 << RMW->getOperationName(RMW->getOperation())
16612 << " operation at memory scope " << MemScope;
16613}
16614
16615static bool isV2F16OrV2BF16(Type *Ty) {
16616 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16617 Type *EltTy = VT->getElementType();
16618 return VT->getNumElements() == 2 &&
16619 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16620 }
16621
16622 return false;
16623}
16624
16625static bool isV2F16(Type *Ty) {
16626 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16627 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16628}
16629
16630static bool isV2BF16(Type *Ty) {
16631 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16632 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16633}
16634
16635/// \return true if atomicrmw integer ops work for the type.
16636static bool isAtomicRMWLegalIntTy(Type *Ty) {
16637 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16638 unsigned BW = IT->getBitWidth();
16639 return BW == 32 || BW == 64;
16640 }
16641
16642 return false;
16643}
16644
16645/// \return true if this atomicrmw xchg type can be selected.
16646static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16647 Type *Ty = RMW->getType();
16648 if (isAtomicRMWLegalIntTy(Ty))
16649 return true;
16650
16651 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16652 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16653 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16654 return BW == 32 || BW == 64;
16655 }
16656
16657 if (Ty->isFloatTy() || Ty->isDoubleTy())
16658 return true;
16659
16660 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16661 return VT->getNumElements() == 2 &&
16662 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16663 }
16664
16665 return false;
16666}
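// A few concrete xchg forms that satisfy the checks above (illustrative,
// assuming 64-bit pointers and default alignment):
//   %a = atomicrmw xchg ptr addrspace(1) %p, i64 %v seq_cst
//   %b = atomicrmw xchg ptr addrspace(1) %p, ptr %q seq_cst
//   %c = atomicrmw xchg ptr addrspace(1) %p, <2 x half> %h seq_cst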
16667
16668/// \returns true if it's valid to emit a native instruction for \p RMW, based
16669/// on the properties of the target memory.
16670static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16671 const AtomicRMWInst *RMW,
16672 bool HasSystemScope) {
16673 // The remote/fine-grained access logic is different from the integer
16674 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16675 // fine-grained access does not work, even for a device local allocation.
16676 //
16677 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16678 // allocations work.
16679 if (HasSystemScope) {
16681 RMW->hasMetadata("amdgpu.no.remote.memory"))
16682 return true;
16684 return true;
16685
16686 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16687}
16688
16689/// \return Action to perform on AtomicRMWInsts for integer operations.
16692 return isAtomicRMWLegalIntTy(RMW->getType())
16695}
16696
16697/// Return if a flat address space atomicrmw can access private memory.
16699 const MDNode *NoaliasAddrSpaceMD =
16700 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16701 if (!NoaliasAddrSpaceMD)
16702 return true;
16703
16704 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16705 ++I) {
16706 auto *Low = mdconst::extract<ConstantInt>(
16707 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16708 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16709 auto *High = mdconst::extract<ConstantInt>(
16710 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16711 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16712 }
16713 }
16714
16715 return true;
16716}
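// For example (illustrative IR): the expansion code later in this file tags
// flat atomics that were proven not to touch private memory with a
// !noalias.addrspace range covering only the private address space (5),
// which makes this helper return false:
//   %old = atomicrmw add ptr %p, i64 %v seq_cst, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}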
16717
16720 unsigned AS = RMW->getPointerAddressSpace();
16721 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16723
16724 // 64-bit flat atomics that dynamically reside in private memory will silently
16725 // be dropped.
16726 //
16727 // Note that we will emit a new copy of the original atomic in the expansion,
16728 // which will be incrementally relegalized.
16729 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16730 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16731 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16734
16735 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16737 ORE.emit([=]() {
16738 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16739 });
16740 return Kind;
16741 };
16742
16743 auto SSID = RMW->getSyncScopeID();
16744 bool HasSystemScope =
16745 SSID == SyncScope::System ||
16746 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16747
16748 auto Op = RMW->getOperation();
16749 switch (Op) {
16750 case AtomicRMWInst::Xchg: {
16751 // PCIe supports add and xchg for system atomics.
16752 return isAtomicRMWLegalXChgTy(RMW)
16755 }
16756 case AtomicRMWInst::Add:
16757 case AtomicRMWInst::And:
16761 case AtomicRMWInst::Sub:
16762 case AtomicRMWInst::Or:
16763 case AtomicRMWInst::Xor: {
16764 // Atomic sub/or/xor do not work over PCI express, but atomic add does.
16765 // InstCombine rewrites these ops with a zero operand to or, so undo that.
16766 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16767 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16768 ConstVal && ConstVal->isNullValue())
16770 }
16771
16773 }
16774 case AtomicRMWInst::FAdd: {
16775 Type *Ty = RMW->getType();
16776
16777 // TODO: Handle REGION_ADDRESS
16778 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16779 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16780 // is fixed to round-to-nearest-even.
16781 //
16782 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16783 // round-to-nearest-even.
16784 //
16785 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16786 // suggests it is OK if the floating-point mode does not match that of the
16787 // calling thread.
16788 if (Ty->isFloatTy()) {
16791 }
16792
16793 if (Ty->isDoubleTy()) {
16794 // Ignores denormal mode, but we don't consider flushing mandatory.
16797 }
16798
16799 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16801
16803 }
16804
16805 // LDS atomics respect the denormal mode from the mode register.
16806 //
16807 // Traditionally f32 global/buffer memory atomics would unconditionally
16808 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16809 // flush.
16810 //
16811 // On targets with flat atomic fadd, denormals would flush depending on
16812 // whether the target address resides in LDS or global memory. We consider
16813 // this flat-maybe-flush as will-flush.
16814 if (Ty->isFloatTy() &&
16818
16819 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16820 // safe. The message phrasing also should be better.
16821 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16822 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16823 // gfx940, gfx12
16824 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16825 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16826 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16827 // gfx90a, gfx940, gfx12
16828 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16829 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16830
16831 // gfx940, gfx12
16832 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16833 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16834 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16835 // gfx90a, gfx940, gfx12
16836 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16837 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16838
16839 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16840 // buffer. gfx12 does have the buffer version.
16841 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16842 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16843 }
16844
16845 // global and flat atomic fadd f64: gfx90a, gfx940.
16846 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16847 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16848
16849 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16850 if (Ty->isFloatTy()) {
16851 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16852 // gfx11+.
16853 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16854 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16855 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16856 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16857 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16858 } else {
16859 // gfx908
16860 if (RMW->use_empty() &&
16862 isV2F16(Ty))
16863 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16864 }
16865 }
16866
16867 // flat atomic fadd f32: gfx940, gfx11+.
16868 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16869 if (Subtarget->hasFlatAtomicFaddF32Inst())
16870 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16871
16872 // If the address is in the flat address space and the type is float, we
16873 // try to expand the operation when the target supports both global and
16874 // LDS atomic fadd. The reason is that the expansion emits an address-
16875 // space check: if the address is in the global address space we emit the
16876 // global atomic fadd; if it is in the shared address space we emit the
16877 // LDS atomic fadd.
16878 if (Subtarget->hasLDSFPAtomicAddF32()) {
16879 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16881 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16883 }
16884 }
16885 }
16886
16888 }
16890 case AtomicRMWInst::FMax: {
16891 Type *Ty = RMW->getType();
16892
16893 // LDS float and double fmin/fmax were always supported.
16894 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16895 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16897 }
16898
16899 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16900 // For flat and global cases:
16901 // float, double in gfx7. Manual claims denormal support.
16902 // Removed in gfx8.
16903 // float, double restored in gfx10.
16904 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16905 //
16906 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16907 // no f32.
16908 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16909 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16910 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16911 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16912 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16913 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16915 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16916 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16917 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16918 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16919 }
16920 }
16921
16923 }
16924 case AtomicRMWInst::Min:
16925 case AtomicRMWInst::Max:
16927 case AtomicRMWInst::UMax: {
16930 // Always expand system scope min/max atomics.
16931 if (HasSystemScope)
16933 }
16934
16936 }
16939 default:
16941 }
16942
16943 llvm_unreachable("covered atomicrmw op switch");
16944}
16945
16951}
16952
16955 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16958}
16959
16962 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16963 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16965
16966 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16968
16969 const DataLayout &DL = CmpX->getDataLayout();
16970
16971 Type *ValTy = CmpX->getNewValOperand()->getType();
16972
16973 // If a 64-bit flat atomic may alias private, we need to avoid using the
16974 // atomic in the private case.
16975 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16977}
16978
16979const TargetRegisterClass *
16980SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16982 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16983 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16984 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16985 : &AMDGPU::SReg_32RegClass;
16986 if (!TRI->isSGPRClass(RC) && !isDivergent)
16987 return TRI->getEquivalentSGPRClass(RC);
16988 if (TRI->isSGPRClass(RC) && isDivergent)
16989 return TRI->getEquivalentVGPRClass(RC);
16990
16991 return RC;
16992}
16993
16994// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16995// uniform values (as produced by the mask results of control flow intrinsics)
16996// used outside of divergent blocks. The phi users need to also be treated as
16997// always uniform.
16998//
16999// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
17000static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17001 unsigned WaveSize) {
17002 // FIXME: We assume we never cast the mask results of a control flow
17003 // intrinsic.
17004 // As a compile-time hack, exit early if the type will not be consistent.
17005 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
17006 if (!IT || IT->getBitWidth() != WaveSize)
17007 return false;
17008
17009 if (!isa<Instruction>(V))
17010 return false;
17011 if (!Visited.insert(V).second)
17012 return false;
17013 bool Result = false;
17014 for (const auto *U : V->users()) {
17015 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17016 if (V == U->getOperand(1)) {
17017 switch (Intrinsic->getIntrinsicID()) {
17018 default:
17019 Result = false;
17020 break;
17021 case Intrinsic::amdgcn_if_break:
17022 case Intrinsic::amdgcn_if:
17023 case Intrinsic::amdgcn_else:
17024 Result = true;
17025 break;
17026 }
17027 }
17028 if (V == U->getOperand(0)) {
17029 switch (Intrinsic->getIntrinsicID()) {
17030 default:
17031 Result = false;
17032 break;
17033 case Intrinsic::amdgcn_end_cf:
17034 case Intrinsic::amdgcn_loop:
17035 Result = true;
17036 break;
17037 }
17038 }
17039 } else {
17040 Result = hasCFUser(U, Visited, WaveSize);
17041 }
17042 if (Result)
17043 break;
17044 }
17045 return Result;
17046}
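// Illustrative wave64 sketch of what hasCFUser() matches: an i64 exec-mask
// value feeding the control-flow intrinsics checked above, e.g.
//   %mask = call i64 @llvm.amdgcn.if.break.i64(i1 %cc, i64 %phi)
//   call void @llvm.amdgcn.end.cf.i64(i64 %mask)
// Such values must live in SGPRs, so the query below reports that a uniform
// register is required even if the phi itself appears divergent.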
17047
17049 const Value *V) const {
17050 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
17051 if (CI->isInlineAsm()) {
17052 // FIXME: This cannot give a correct answer. This should only trigger in
17053 // the case where inline asm returns mixed SGPR and VGPR results, used
17054 // outside the defining block. We don't have a specific result to
17055 // consider, so this assumes if any value is SGPR, the overall register
17056 // also needs to be SGPR.
17057 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17059 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
17060 for (auto &TC : TargetConstraints) {
17061 if (TC.Type == InlineAsm::isOutput) {
17063 const TargetRegisterClass *RC =
17064 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
17065 TC.ConstraintVT)
17066 .second;
17067 if (RC && SIRI->isSGPRClass(RC))
17068 return true;
17069 }
17070 }
17071 }
17072 }
17074 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17075}
17076
17078 for (SDUse &Use : N->uses()) {
17079 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17080 if (getBasePtrIndex(M) == Use.getOperandNo())
17081 return true;
17082 }
17083 }
17084 return false;
17085}
17086
17088 SDValue N1) const {
17089 if (!N0.hasOneUse())
17090 return false;
17091 // Take care to preserve the opportunity to keep N0 uniform.
17092 if (N0->isDivergent() || !N1->isDivergent())
17093 return true;
17094 // Check whether we have a good chance of forming the memory access pattern
17095 // with the base and offset.
17096 return (DAG.isBaseWithConstantOffset(N0) &&
17098}
17099
17101 Register N0, Register N1) const {
17102 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17103}
17104
17107 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17109 if (I.getMetadata("amdgpu.noclobber"))
17110 Flags |= MONoClobber;
17111 if (I.getMetadata("amdgpu.last.use"))
17112 Flags |= MOLastUse;
17113 return Flags;
17114}
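// Illustrative input for the hook above: a load annotated by
// AMDGPUAnnotateUniformValues such as
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
// gets MONoClobber added to its MachineMemOperand flags; likewise an
// !amdgpu.last.use annotation maps to MOLastUse.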
17115
17117 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17118 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
17119 if (User->getOpcode() != ISD::CopyToReg)
17120 return false;
17121 if (!Def->isMachineOpcode())
17122 return false;
17123 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17124 if (!MDef)
17125 return false;
17126
17127 unsigned ResNo = User->getOperand(Op).getResNo();
17128 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17129 return false;
17130 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17131 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17132 PhysReg = AMDGPU::SCC;
17133 const TargetRegisterClass *RC =
17134 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17135 Cost = RC->getCopyCost();
17136 return true;
17137 }
17138 return false;
17139}
17140
17141/// Check if it is profitable to hoist instruction in then/else to if.
17143 if (!I->hasOneUse())
17144 return true;
17145
17146 Instruction *User = I->user_back();
17147 // TODO: Add more patterns that are not profitable to hoist and
17148 // handle modifiers such as fabs and fneg
17149 switch (I->getOpcode()) {
17150 case Instruction::FMul: {
17151 if (User->getOpcode() != Instruction::FSub &&
17152 User->getOpcode() != Instruction::FAdd)
17153 return true;
17154
17156
17157 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17158 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17159 !Options.UnsafeFPMath) ||
17160 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17161 }
17162 default:
17163 return true;
17164 }
17165 return true;
17166}
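// Worked example for the FMul case above (illustrative): given
//   %m = fmul contract float %a, %b
//   %r = fadd contract float %m, %c
// hoisting %m into the common predecessor would separate it from its only
// user and block fma formation, so hoisting is reported as unprofitable
// unless contraction is impossible anyway or FMA is not faster on this
// subtarget.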
17167
17169 Instruction *AI) const {
17170 // Given: atomicrmw fadd ptr %addr, float %val ordering
17171 //
17172 // With this expansion we produce the following code:
17173 // [...]
17174 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17175 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17176 //
17177 // atomicrmw.shared:
17178 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17179 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17180 // float %val ordering
17181 // br label %atomicrmw.phi
17182 //
17183 // atomicrmw.check.private:
17184 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
17185 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17186 //
17187 // atomicrmw.private:
17188 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17189 // %loaded.private = load float, ptr addrspace(5) %cast.private
17190 // %val.new = fadd float %loaded.private, %val
17191 // store float %val.new, ptr addrspace(5) %cast.private
17192 // br label %atomicrmw.phi
17193 //
17194 // atomicrmw.global:
17195 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17196 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17197 // float %val ordering
17198 // br label %atomicrmw.phi
17199 //
17200 // atomicrmw.phi:
17201 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17202 // [ %loaded.private, %atomicrmw.private ],
17203 // [ %loaded.global, %atomicrmw.global ]
17204 // br label %atomicrmw.end
17205 //
17206 // atomicrmw.end:
17207 // [...]
17208 //
17209 //
17210 // For 64-bit atomics which may reside in private memory, we perform a simpler
17211 // version that only inserts the private check, and uses the flat operation.
17212
17213 IRBuilder<> Builder(AI);
17214 LLVMContext &Ctx = Builder.getContext();
17215
17216 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17217 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17219 Value *Addr = AI->getOperand(PtrOpIdx);
17220
17221 /// TODO: Only need to check private, then emit flat-known-not-private (no
17222 /// need for a shared block, or a cast to global).
17223 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17224
17225 Align Alignment;
17226 if (RMW)
17227 Alignment = RMW->getAlign();
17228 else if (CX)
17229 Alignment = CX->getAlign();
17230 else
17231 llvm_unreachable("unhandled atomic operation");
17232
17233 // FullFlatEmulation is true if we need to issue the private, shared, and
17234 // global cases.
17235 //
17236 // If this is false, we are only dealing with the flat-targeting-private case,
17237 // where we only insert a check for private and still use the flat instruction
17238 // for global and shared.
17239
17240 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17241 Subtarget->hasAtomicFaddInsts() &&
17242 RMW->getType()->isFloatTy();
17243
17244 // If the return value isn't used, do not introduce a false use in the phi.
17245 bool ReturnValueIsUsed = !AI->use_empty();
17246
17247 BasicBlock *BB = Builder.GetInsertBlock();
17248 Function *F = BB->getParent();
17249 BasicBlock *ExitBB =
17250 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17251 BasicBlock *SharedBB = nullptr;
17252
17253 BasicBlock *CheckPrivateBB = BB;
17254 if (FullFlatEmulation) {
17255 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17256 CheckPrivateBB =
17257 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17258 }
17259
17260 BasicBlock *PrivateBB =
17261 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17262 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17263 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17264
17265 std::prev(BB->end())->eraseFromParent();
17266 Builder.SetInsertPoint(BB);
17267
17268 Value *LoadedShared = nullptr;
17269 if (FullFlatEmulation) {
17270 CallInst *IsShared = Builder.CreateIntrinsic(
17271 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17272 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17273 Builder.SetInsertPoint(SharedBB);
17274 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17276
17277 Instruction *Clone = AI->clone();
17278 Clone->insertInto(SharedBB, SharedBB->end());
17279 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17280 LoadedShared = Clone;
17281
17282 Builder.CreateBr(PhiBB);
17283 Builder.SetInsertPoint(CheckPrivateBB);
17284 }
17285
17286 CallInst *IsPrivate = Builder.CreateIntrinsic(
17287 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17288 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17289
17290 Builder.SetInsertPoint(PrivateBB);
17291
17292 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17294
17295 Value *LoadedPrivate;
17296 if (RMW) {
17297 LoadedPrivate = Builder.CreateAlignedLoad(
17298 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17299
17300 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17301 LoadedPrivate, RMW->getValOperand());
17302
17303 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17304 } else {
17305 auto [ResultLoad, Equal] =
17306 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17307 CX->getNewValOperand(), CX->getAlign());
17308
17309 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17310 ResultLoad, 0);
17311 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17312 }
17313
17314 Builder.CreateBr(PhiBB);
17315
17316 Builder.SetInsertPoint(GlobalBB);
17317
17318 // Continue using a flat instruction if we only emitted the check for private.
17319 Instruction *LoadedGlobal = AI;
17320 if (FullFlatEmulation) {
17321 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17323 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17324 }
17325
17326 AI->removeFromParent();
17327 AI->insertInto(GlobalBB, GlobalBB->end());
17328
17329 // The new atomicrmw may go through another round of legalization later.
17330 if (!FullFlatEmulation) {
17331 // We inserted the runtime check already; make sure we do not try to
17332 // re-expand this.
17333 // TODO: Should union with any existing metadata.
17334 MDBuilder MDB(F->getContext());
17335 MDNode *RangeNotPrivate =
17338 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17339 RangeNotPrivate);
17340 }
17341
17342 Builder.CreateBr(PhiBB);
17343
17344 Builder.SetInsertPoint(PhiBB);
17345
17346 if (ReturnValueIsUsed) {
17347 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17348 AI->replaceAllUsesWith(Loaded);
17349 if (FullFlatEmulation)
17350 Loaded->addIncoming(LoadedShared, SharedBB);
17351 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17352 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17353 Loaded->takeName(AI);
17354 }
17355
17356 Builder.CreateBr(ExitBB);
17357}
17358
17361
17364 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17365 ConstVal && ConstVal->isNullValue()) {
17366 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17368
17369 // We may still need the private-alias-flat handling below.
17370
17371 // TODO: Skip this for cases where we cannot access remote memory.
17372 }
17373 }
17374
17375 // The non-flat expansions should only perform the de-canonicalization of
17376 // identity values.
17378 return;
17379
17381}
17382
17385}
17386
17387LoadInst *
17389 IRBuilder<> Builder(AI);
17390 auto Order = AI->getOrdering();
17391
17392 // The optimization removes the store aspect of the atomicrmw. Therefore, the
17393 // cache must be flushed if the atomic ordering has release semantics. This is
17394 // not necessarily a fence; a release fence just happens to perform that flush.
17395 // Avoid replacing an atomicrmw that has release semantics.
17396 if (isReleaseOrStronger(Order))
17397 return nullptr;
17398
17399 LoadInst *LI = Builder.CreateAlignedLoad(
17400 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17401 LI->setAtomic(Order, AI->getSyncScopeID());
17402 LI->copyMetadata(*AI);
17403 LI->takeName(AI);
17404 AI->replaceAllUsesWith(LI);
17405 AI->eraseFromParent();
17406 return LI;
17407}
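// Illustrative end-to-end example for the routine above (assuming plain IR
// syntax): an idempotent update such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 acquire
// is replaced by an atomic load with the same ordering, scope and metadata:
//   %old = load atomic i32, ptr addrspace(1) %p acquire, align 4
// A release (or stronger) ordering bails out instead, as explained above.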
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
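As a hedged sketch of how these GCNSubtarget predicates are typically consumed (not actual SIISelLowering logic; the helper is hypothetical and assumes the usual AMDGPU headers):
// Hypothetical predicate: whether a 128-bit DS access may be emitted for the
// given alignment, gated purely on the feature queries listed above.
static bool mayUseWideDS(const GCNSubtarget &ST, Align Alignment) {
  if (!ST.useDS128())
    return false;
  // Misaligned DS access is only acceptable when explicitly enabled.
  return Alignment.value() >= 16 || ST.hasUnalignedDSAccessEnabled();
}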
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:512
unsigned getAddressSpace() const
Definition: GlobalValue.h:206
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
Type * getValueType() const
Definition: GlobalValue.h:297
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
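A self-contained sketch of the IRBuilder methods listed above (SetInsertPoint, CreateCondBr, CreateBr, CreatePHI); the function and block names are illustrative only.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Hypothetical helper: emit an if/else diamond and merge two values through
// a PHI node. Cond, A and B are assumed to be valid values of matching type
// that are available in F (e.g. arguments or constants).
static Value *emitDiamond(Function &F, Value *Cond, Value *A, Value *B) {
  LLVMContext &Ctx = F.getContext();
  IRBuilder<> Builder(Ctx);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", &F);
  BasicBlock *Then = BasicBlock::Create(Ctx, "then", &F);
  BasicBlock *Else = BasicBlock::Create(Ctx, "else", &F);
  BasicBlock *Merge = BasicBlock::Create(Ctx, "merge", &F);

  Builder.SetInsertPoint(Entry);
  Builder.CreateCondBr(Cond, Then, Else);

  Builder.SetInsertPoint(Then);
  Builder.CreateBr(Merge);
  Builder.SetInsertPoint(Else);
  Builder.CreateBr(Merge);

  Builder.SetInsertPoint(Merge);
  PHINode *Phi = Builder.CreatePHI(A->getType(), 2, "merged");
  Phi->addIncoming(A, Then);
  Phi->addIncoming(B, Else);
  return Phi;
}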
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:385
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
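A brief sketch of constructing and inspecting low-level types with the LLT helpers above (assuming the usual LLVM headers are included):
// Hypothetical examples of the LLT constructors and accessors listed above.
LLT S32 = LLT::scalar(32);                  // 32-bit scalar
LLT Ptr = LLT::pointer(1, 64);              // 64-bit pointer in addrspace(1)
bool IsScalar = S32.isScalar();             // true
unsigned Bits = S32.getScalarSizeInBits();  // 32
LLT S16 = S32.changeElementSize(16);        // a scalar stays scalar, now 16 bits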
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
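A short, illustrative sketch of the MVT factory and query methods above (headers assumed):
// Hypothetical examples: build a vector MVT and derive related types.
MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);       // v4f32
MVT Elt = V4F32.getScalarType();                 // f32
unsigned NumElts = V4F32.getVectorNumElements(); // 4
MVT Wide = MVT::getIntegerVT(V4F32.getSizeInBits().getFixedValue()); // i128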
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks which refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
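A hedged sketch of the chained MachineInstrBuilder style these methods support; BuildMI, TII, MBB, I, DL, DstReg and SrcReg are assumed to exist in the surrounding code, and the opcodes are illustrative.
// Hypothetical: s_mov_b32 DstReg, 42
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(42);
// Hypothetical: s_add_u32 DstReg, SrcReg, 16
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), DstReg)
    .addReg(SrcReg)
    .addImm(16);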
Representation of each machine instruction.
Definition: MachineInstr.h:71
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
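For illustration, a minimal sketch of allocating and querying a MachineMemOperand with the flags listed above; MF is an assumed MachineFunction reference.
// Hypothetical: describe a dereferenceable 32-bit load with 4-byte alignment.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
    LLT::scalar(32), Align(4));
bool IsVolatile = (MMO->getFlags() & MachineMemOperand::MOVolatile) != 0; // false here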
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
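A small sketch of the SDValue/SDNode accessors above in their usual pattern-matching role; the helper is hypothetical.
// Hypothetical helper: look through a single-use zero-extend, if present.
static SDValue stripOneUseZExt(SDValue V) {
  if (V.getOpcode() == ISD::ZERO_EXTEND && V.hasOneUse())
    return V.getOperand(0);
  return V;
}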
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we are happy to sink it into basic blocks.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending or truncating it.
Definition: SelectionDAG.h:983
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands, and they produce a value as well as a token chain.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not necessarily identical types.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:713
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or truncating it.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncating it.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
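As a hedged sketch of the SelectionDAG node builders above, here is how a lowering routine might materialize (X << 2) + 16 for an i32 value; DAG, DL and X are assumed to exist in the surrounding code.
// Hypothetical: build (X << 2) + 16 as DAG nodes.
SDValue Shift = DAG.getNode(ISD::SHL, DL, MVT::i32, X,
                            DAG.getShiftAmountConstant(2, MVT::i32, DL));
SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, Shift,
                          DAG.getConstant(16, DL, MVT::i32));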
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
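A hedged sketch of the StringSwitch idiom above, e.g. for mapping a register name string; the names chosen are illustrative and RegName is an assumed StringRef.
// Hypothetical: map a textual register name onto a physical register.
Register Reg = StringSwitch<Register>(RegName)
                   .Case("m0", AMDGPU::M0)
                   .Case("exec", AMDGPU::EXEC)
                   .Default(Register());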
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider type.
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target and, if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
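A short sketch of registering for combine callbacks, assuming a hypothetical target; the chosen nodes are examples only.
// Fragment of a hypothetical TargetLowering constructor: ask the DAG combiner
// to call this target's PerformDAGCombine() for these generic nodes.
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::SELECT});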
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware multiplication: like the overflow-aware add/sub nodes, the second result indicates whether the operation overflowed.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum; the same as FMINNUM_IEEE and FMAXNUM_IEEE except ...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1643
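A small self-contained sketch of these condition-code helpers; the header location is assumed to be ISDOpcodes.h and the function name is hypothetical.
#include "llvm/CodeGen/ISDOpcodes.h"

// Swapping the operand order of a signed greater-than turns it into a signed
// less-than; the predicate itself stays signed.
static void condCodeSketch() {
  llvm::ISD::CondCode CC = llvm::ISD::SETGT;
  llvm::ISD::CondCode Swapped = llvm::ISD::getSetCCSwappedOperands(CC); // SETLT
  bool IsSigned = llvm::ISD::isSignedIntSetCC(CC);                      // true
  (void)Swapped; (void)IsSigned;
}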
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1612
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:245
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:60
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:556
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:286
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
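A self-contained sketch of the bit-manipulation helpers referenced above; the function name and values are arbitrary examples.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void bitHelpersSketch() {
  uint32_t V = 0x00F0u;                 // binary ...0000 1111 0000
  int Ones = llvm::popcount(V);         // 4 set bits
  int TZ = llvm::countr_zero(V);        // 4 trailing zeros
  int LZ = llvm::countl_zero(V);        // 24 leading zeros
  int Bits = llvm::bit_width(V);        // 8 bits needed to represent 0xF0
  bool Pow2 = llvm::isPowerOf2_32(V);   // false: four bits are set
  uint64_t Up = llvm::PowerOf2Ceil(V);  // 0x100, the next power of two
  unsigned Log = llvm::Log2_32(0x100);  // 8
  (void)Ones; (void)TZ; (void)LZ; (void)Bits; (void)Pow2; (void)Up; (void)Log;
}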
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
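A short sketch of splitting a 64-bit value with these helpers; the immediate and function name are arbitrary examples.
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void splitImmSketch() {
  uint64_t Imm = 0x1122334455667788ULL;
  uint32_t Hi = llvm::Hi_32(Imm); // 0x11223344
  uint32_t Lo = llvm::Lo_32(Imm); // 0x55667788
  (void)Hi; (void)Lo;
}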
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:236
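A small sketch using these helpers to range-check a signed immediate; the function name and bit width are hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Returns true if Imm fits in a 16-bit signed field.
static bool fitsInSigned16(int64_t Imm) {
  return Imm >= llvm::minIntN(16) &&  // -32768
         Imm <= llvm::maxIntN(16);    //  32767
}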
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
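A self-contained sketch of the alignment helpers listed above; the numbers are arbitrary examples.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void alignmentSketch() {
  llvm::Align A(16);
  uint64_t Up = llvm::alignTo(100, A);          // 112, next multiple of 16
  uint64_t Down = llvm::alignDown(100, 16);     // 96
  uint64_t Dwords = llvm::divideCeil(100, 4);   // 25
  llvm::Align C = llvm::commonAlignment(A, 8);  // Align(8): limited by the offset
  (void)Up; (void)Down; (void)Dwords; (void)C;
}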
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
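A minimal sketch of building value types with the EVT factory functions; it only assumes an LLVMContext supplied by the caller, and the function name is hypothetical.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48);  // extended (non-simple) type
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  bool Simple = V4F32.isSimple();                        // true, maps onto MVT::v4f32
  unsigned Elts = V4F32.getVectorNumElements();          // 4
  uint64_t Bits = V4F32.getSizeInBits().getFixedValue(); // 128
  (void)I48; (void)Simple; (void)Elts; (void)Bits;
}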
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
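A small self-contained sketch of KnownBits::add; the operand values are arbitrary and the stated result holds under that assumption.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static void knownBitsSketch() {
  // LHS is a fully known constant, RHS only has its top 24 bits known zero.
  llvm::KnownBits LHS = llvm::KnownBits::makeConstant(llvm::APInt(32, 12));
  llvm::KnownBits RHS(32);
  RHS.Zero.setHighBits(24);
  llvm::KnownBits Sum = llvm::KnownBits::add(LHS, RHS);
  // The sum is at most 12 + 255, so it fits in 9 bits: at least 23 leading
  // zeros are known.
  unsigned MinLZ = Sum.countMinLeadingZeros();
  (void)MinLZ;
}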
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals