SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
55 "amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing",
62 cl::desc("Use indirect register addressing for divergent indexes"),
63 cl::init(false));
64
67 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
68}
69
72 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
73}
74
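// Return the first SGPR_32 register that the calling convention state has not
// yet allocated; used when reserving additional system SGPRs. Hits
// llvm_unreachable if the whole register class is exhausted.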
75static unsigned findFirstFreeSGPR(CCState &CCInfo) {
76 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
77 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
79 return AMDGPU::SGPR0 + Reg;
80 }
81 }
82 llvm_unreachable("Cannot allocate sgpr");
83}
84
86 const GCNSubtarget &STI)
88 Subtarget(&STI) {
89 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
90 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
91
92 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
93 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
94
95 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
99
100 addRegisterClass(MVT::f64, V64RegClass);
101 addRegisterClass(MVT::v2f32, V64RegClass);
102 addRegisterClass(MVT::Untyped, V64RegClass);
103
104 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
105 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
106
107 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
108 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
109
110 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
111 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
112
113 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
114 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
115
116 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
117 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
118
119 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
120 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
121
122 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
123 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
124
125 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
126 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
127
128 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
129 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
130
131 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
132 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
133
134 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
135 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
136
137 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
138 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
139
140 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
141 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
142
143 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
144 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
145
146 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
147 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
148
149 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
150 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
151
152 if (Subtarget->has16BitInsts()) {
153 if (Subtarget->useRealTrue16Insts()) {
154 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
156 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
157 } else {
158 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
160 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
161 }
162
163 // Unless there are also VOP3P operations, no operations are really legal.
164 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
170 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
173 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
176 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
178 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
179 }
180
181 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
182 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
183
185
186 // The boolean content concept here is too inflexible. Compares only ever
187 // really produce a 1-bit result. Any copy/extend from these will turn into a
188 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
189 // it's what most targets use.
192
193 // We need to custom lower vector stores from local memory
195 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
196 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
197 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
198 MVT::i1, MVT::v32i32},
199 Custom);
200
202 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
203 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
204 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
205 MVT::i1, MVT::v32i32},
206 Custom);
207
208 if (isTypeLegal(MVT::bf16)) {
209 for (unsigned Opc :
218 ISD::SETCC}) {
219 // FIXME: The promoted-to type shouldn't need to be explicit
220 setOperationAction(Opc, MVT::bf16, Promote);
221 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
222 }
223
225
227 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
228
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237 }
238
239 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
240 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
241 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
242 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
243 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
244 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
245 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
246 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
247 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
248 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
249 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
250 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
251 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
252 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
253 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
254 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
255
256 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
257 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
258 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
262 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
263
264 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
265
269 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
270
271 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
272
274 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
275
277 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
278 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
279
281 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
282 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
283 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
284 Expand);
286 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
287 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
288 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
289 Expand);
290
292 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
293 MVT::v3i16, MVT::v4i16, MVT::Other},
294 Custom);
295
298 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
299
301
303
305 Expand);
306
307#if 0
309#endif
310
311 // We only support LOAD/STORE and vector manipulation ops for vectors
312 // with > 4 elements.
313 for (MVT VT :
314 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
315 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
316 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
317 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
318 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
319 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
320 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
321 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
322 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
323 switch (Op) {
324 case ISD::LOAD:
325 case ISD::STORE:
327 case ISD::BITCAST:
328 case ISD::UNDEF:
332 case ISD::IS_FPCLASS:
333 break;
338 break;
339 default:
341 break;
342 }
343 }
344 }
345
347
348 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
349 // is expanded to avoid having two separate loops in case the index is a VGPR.
350
351 // Most operations are naturally 32-bit vector operations. We only support
352 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
353 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
355 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
356
358 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
365 }
366
367 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
379 }
380
381 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
393 }
394
395 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
407 }
408
409 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
421 }
422
424 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
425 Expand);
426
427 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
428 Custom);
429
430 // Avoid stack access for these.
431 // TODO: Generalize to more vector types.
433 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
434 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
435 Custom);
436
437 // Deal with vec3 vector operations when widened to vec4.
439 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
440
441 // Deal with vec5/6/7 vector operations when widened to vec8.
443 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
444 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
445 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
446 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
447 Custom);
448
449 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
450 // and output demarshalling
451 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
452
453 // We can't return success/failure, only the old value,
454 // let LLVM add the comparison
456 Expand);
457
458 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
459
460 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
461
462 // FIXME: This should be narrowed to i32, but that only happens if i64 is
463 // illegal.
464 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
465 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
466
467 // This is s_memtime on SI and s_memrealtime on VI.
469
470 if (Subtarget->hasSMemRealTime() ||
474
475 if (Subtarget->has16BitInsts()) {
478 } else {
480 }
481
482 if (Subtarget->hasMadMacF32Insts())
484
485 if (!Subtarget->hasBFI())
486 // fcopysign can be done in a single instruction with BFI.
487 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
488
489 if (!Subtarget->hasBCNT(32))
491
492 if (!Subtarget->hasBCNT(64))
494
495 if (Subtarget->hasFFBH())
497
498 if (Subtarget->hasFFBL())
500
501 // We only really have 32-bit BFE instructions (and 16-bit on VI).
502 //
503 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
504 // effort to match them now. We want this to be false for i64 cases when the
505 // extraction isn't restricted to the upper or lower half. Ideally we would
506 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
507 // span the midpoint are probably relatively rare, so don't worry about them
508 // for now.
509 if (Subtarget->hasBFE())
511
512 // Clamp modifier on add/sub
513 if (Subtarget->hasIntClamp())
515
516 if (Subtarget->hasAddNoCarry())
517 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
518 Legal);
519
520 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
521 Custom);
522
523 // These are really only legal for ieee_mode functions. We should be avoiding
524 // them for functions that don't have ieee_mode enabled, so just say they are
525 // legal.
527 {MVT::f32, MVT::f64}, Legal);
528
529 if (Subtarget->haveRoundOpsF64())
531 Legal);
532 else
534 MVT::f64, Custom);
535
537 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
538 Legal);
539 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
540
543
544 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
546
547 // Custom lower these because we can't specify a rule based on an illegal
548 // source bf16.
551
552 if (Subtarget->has16BitInsts()) {
555 MVT::i16, Legal);
556
557 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
558
560 MVT::i16, Expand);
561
565 ISD::CTPOP},
566 MVT::i16, Promote);
567
569
570 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
571
573 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
575 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
576
580
582
583 // F16 - Constant Actions.
586
587 // F16 - Load/Store Actions.
589 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
591 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
592
593 // BF16 - Load/Store Actions.
595 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
597 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
598
599 // F16 - VOP1 Actions.
602 MVT::f16, Custom);
603
606
607 // F16 - VOP2 Actions.
608 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
609 Expand);
613
614 // F16 - VOP3 Actions.
616 if (STI.hasMadF16())
618
619 for (MVT VT :
620 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
621 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
622 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
623 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
624 switch (Op) {
625 case ISD::LOAD:
626 case ISD::STORE:
628 case ISD::BITCAST:
629 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
639 break;
640 default:
642 break;
643 }
644 }
645 }
646
647 // v_perm_b32 can handle either of these.
648 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
650
651 // XXX - Do these do anything? Vector constants turn into build_vector.
652 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
653
654 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
655 Legal);
656
658 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
660 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
661
663 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
665 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
666
667 setOperationAction(ISD::AND, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::OR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
671 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
672 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
673
675 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
677 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
678 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
679 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
680
682 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
686 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
687
689 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
691 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
692 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
694
696 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
698 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
699
701 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
705 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
706
707 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
711 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
713
715 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
717 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
718 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
720
721 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
725 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
727
729 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
731 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
732 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
734
736 MVT::v2i32, Expand);
738
740 MVT::v4i32, Expand);
741
743 MVT::v8i32, Expand);
744
745 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
746 Subtarget->hasVOP3PInsts() ? Legal : Custom);
747
748 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
749 // This isn't really legal, but this avoids the legalizer unrolling it (and
750 // allows matching fneg (fabs x) patterns)
751 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
752
755
757 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
758 Custom);
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Expand);
763
764 for (MVT Vec16 :
765 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
766 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
769 Vec16, Custom);
771 }
772 }
773
774 if (Subtarget->hasVOP3PInsts()) {
778 MVT::v2i16, Legal);
779
782 MVT::v2f16, Legal);
783
784 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
785 Custom);
786
788 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
789 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 }
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
864 MVT::i8},
865 Custom);
866
868 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
869 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
870 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
871 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
872 Custom);
873
875 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
876 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
877 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
878 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
886
887 // TODO: Could move this to custom lowering, could benefit from combines on
888 // extract of relevant bits.
890
892
895 ISD::SUB,
897 ISD::FADD,
898 ISD::FSUB,
899 ISD::FDIV,
906 ISD::FMA,
907 ISD::SMIN,
908 ISD::SMAX,
909 ISD::UMIN,
910 ISD::UMAX,
912 ISD::AND,
913 ISD::OR,
914 ISD::XOR,
915 ISD::FSHR,
925
926 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
928
929 // All memory operations. Some folding on the pointer operand is done to help
930 // matching the constant offsets in the addressing modes.
955
956 // FIXME: In other contexts we pretend this is a per-function property.
958
960}
961
963 return Subtarget;
964}
965
967 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
968 return RCRegs;
969}
970
971//===----------------------------------------------------------------------===//
972// TargetLowering queries
973//===----------------------------------------------------------------------===//
974
975// v_mad_mix* support a conversion from f16 to f32.
976//
977// There is only one special case, when denormals are enabled, that we don't
978// currently handle where this would be OK to use.
979bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
980 EVT DestVT, EVT SrcVT) const {
981 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
982 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
983 DestVT.getScalarType() == MVT::f32 &&
984 SrcVT.getScalarType() == MVT::f16 &&
985 // TODO: This probably only requires no input flushing?
987}
988
990 LLT DestTy, LLT SrcTy) const {
991 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
992 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
993 DestTy.getScalarSizeInBits() == 32 &&
994 SrcTy.getScalarSizeInBits() == 16 &&
995 // TODO: This probably only requires no input flushing?
997}
998
1000 // SI has some legal vector types, but no legal vector operations. Say no
1001 // shuffles are legal in order to prefer scalarizing some vector operations.
1002 return false;
1003}
1004
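// Register type used to pass a value of VT under this calling convention.
// Illustrative examples of the rules below: with 16-bit instructions a v4f16
// argument is passed in packed v2f16 registers, while scalar types wider than
// 32 bits are passed as i32 pieces.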
1007 EVT VT) const {
1010
1011 if (VT.isVector()) {
1012 EVT ScalarVT = VT.getScalarType();
1013 unsigned Size = ScalarVT.getSizeInBits();
1014 if (Size == 16) {
1015 if (Subtarget->has16BitInsts()) {
1016 if (VT.isInteger())
1017 return MVT::v2i16;
1018 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1019 }
1020 return VT.isInteger() ? MVT::i32 : MVT::f32;
1021 }
1022
1023 if (Size < 16)
1024 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1025 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1026 }
1027
1028 if (VT.getSizeInBits() > 32)
1029 return MVT::i32;
1030
1032}
1033
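// Number of registers needed to pass VT under this calling convention.
// Illustrative examples of the rules below: with 16-bit instructions a v3f16
// argument takes (3 + 1) / 2 = 2 packed registers, and a v2i64 argument takes
// 2 * ((64 + 31) / 32) = 4 registers.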
1036 EVT VT) const {
1039
1040 if (VT.isVector()) {
1041 unsigned NumElts = VT.getVectorNumElements();
1042 EVT ScalarVT = VT.getScalarType();
1043 unsigned Size = ScalarVT.getSizeInBits();
1044
1045 // FIXME: Should probably promote 8-bit vectors to i16.
1046 if (Size == 16 && Subtarget->has16BitInsts())
1047 return (NumElts + 1) / 2;
1048
1049 if (Size <= 32)
1050 return NumElts;
1051
1052 if (Size > 32)
1053 return NumElts * ((Size + 31) / 32);
1054 } else if (VT.getSizeInBits() > 32)
1055 return (VT.getSizeInBits() + 31) / 32;
1056
1058}
1059
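// Break a vector argument into intermediate values for non-kernel calling
// conventions. For example (illustrative), with 16-bit instructions a v5f16
// value becomes (5 + 1) / 2 = 3 v2f16 intermediates, while a v2i64 value
// becomes 2 * ((64 + 31) / 32) = 4 i32 intermediates.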
1061 LLVMContext &Context, CallingConv::ID CC,
1062 EVT VT, EVT &IntermediateVT,
1063 unsigned &NumIntermediates, MVT &RegisterVT) const {
1064 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1065 unsigned NumElts = VT.getVectorNumElements();
1066 EVT ScalarVT = VT.getScalarType();
1067 unsigned Size = ScalarVT.getSizeInBits();
1068 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1069 // support, but unless we can properly handle 3-vectors, it will still be
1070 // inconsistent.
1071 if (Size == 16 && Subtarget->has16BitInsts()) {
1072 if (ScalarVT == MVT::bf16) {
1073 RegisterVT = MVT::i32;
1074 IntermediateVT = MVT::v2bf16;
1075 } else {
1076 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1077 IntermediateVT = RegisterVT;
1078 }
1079 NumIntermediates = (NumElts + 1) / 2;
1080 return NumIntermediates;
1081 }
1082
1083 if (Size == 32) {
1084 RegisterVT = ScalarVT.getSimpleVT();
1085 IntermediateVT = RegisterVT;
1086 NumIntermediates = NumElts;
1087 return NumIntermediates;
1088 }
1089
1090 if (Size < 16 && Subtarget->has16BitInsts()) {
1091 // FIXME: Should probably form v2i16 pieces
1092 RegisterVT = MVT::i16;
1093 IntermediateVT = ScalarVT;
1094 NumIntermediates = NumElts;
1095 return NumIntermediates;
1096 }
1097
1098
1099 if (Size != 16 && Size <= 32) {
1100 RegisterVT = MVT::i32;
1101 IntermediateVT = ScalarVT;
1102 NumIntermediates = NumElts;
1103 return NumIntermediates;
1104 }
1105
1106 if (Size > 32) {
1107 RegisterVT = MVT::i32;
1108 IntermediateVT = RegisterVT;
1109 NumIntermediates = NumElts * ((Size + 31) / 32);
1110 return NumIntermediates;
1111 }
1112 }
1113
1115 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1116}
1117
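// Compute the memory VT actually read by a load-like intrinsic: vector
// results are clamped to the number of lanes the instruction really returns
// (e.g. as implied by the dmask of an image load).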
1119 const DataLayout &DL, Type *Ty,
1120 unsigned MaxNumLanes) {
1121 assert(MaxNumLanes != 0);
1122
1123 LLVMContext &Ctx = Ty->getContext();
1124 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1125 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1126 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1127 NumElts);
1128 }
1129
1130 return TLI.getValueType(DL, Ty);
1131}
1132
1133// Peek through TFE struct returns to only use the data size.
1135 const DataLayout &DL, Type *Ty,
1136 unsigned MaxNumLanes) {
1137 auto *ST = dyn_cast<StructType>(Ty);
1138 if (!ST)
1139 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1140
1141 // TFE intrinsics return an aggregate type.
1142 assert(ST->getNumContainedTypes() == 2 &&
1143 ST->getContainedType(1)->isIntegerTy(32));
1144 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1145}
1146
1147/// Map address space 7 to MVT::v5i32 because that's its in-memory
1148/// representation. This return value is vector-typed because there is no
1149/// MVT::i160 and it is not clear if one can be added. While this could
1150/// cause issues during codegen, these address space 7 pointers will be
1151/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1152/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1153/// modeling, to work.
1155 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1156 return MVT::v5i32;
1158 DL.getPointerSizeInBits(AS) == 192)
1159 return MVT::v6i32;
1161}
1162/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1163/// v8i32 when padding is added.
1164/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1165/// also v8i32 with padding.
1167 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1168 DL.getPointerSizeInBits(AS) == 160) ||
1170 DL.getPointerSizeInBits(AS) == 192))
1171 return MVT::v8i32;
1173}
1174
1176 const CallInst &CI,
1177 MachineFunction &MF,
1178 unsigned IntrID) const {
1180 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1182
1183 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1186 (Intrinsic::ID)IntrID);
1187 MemoryEffects ME = Attr.getMemoryEffects();
1188 if (ME.doesNotAccessMemory())
1189 return false;
1190
1191 // TODO: Should images get their own address space?
1192 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1193
1194 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1195 if (RsrcIntr->IsImage) {
1198 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1199 Info.align.reset();
1200 }
1201
1202 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1203 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1204 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1205 // We conservatively set the memory operand of a buffer intrinsic to the
1206 // base resource pointer, so that we can access alias information about
1207 // those pointers. Cases like "this points at the same value
1208 // but with a different offset" are handled in
1209 // areMemAccessesTriviallyDisjoint.
1210 Info.ptrVal = RsrcArg;
1211 }
1212
1213 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1214 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1217 if (ME.onlyReadsMemory()) {
1218 if (RsrcIntr->IsImage) {
1219 unsigned MaxNumLanes = 4;
1220
1221 if (!BaseOpcode->Gather4) {
1222 // If this isn't a gather, we may have excess loaded elements in the
1223 // IR type. Check the dmask for the real number of elements loaded.
1224 unsigned DMask
1225 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1226 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 }
1228
1229 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1230 CI.getType(), MaxNumLanes);
1231 } else {
1232 Info.memVT =
1234 std::numeric_limits<unsigned>::max());
1235 }
1236
1237 // FIXME: What does alignment mean for an image?
1240 } else if (ME.onlyWritesMemory()) {
1242
1243 Type *DataTy = CI.getArgOperand(0)->getType();
1244 if (RsrcIntr->IsImage) {
1245 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1246 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1247 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1248 DMaskLanes);
1249 } else
1250 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1251
1253 } else {
1254 // Atomic or NoReturn Sampler
1255 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1260
1261 switch (IntrID) {
1262 default:
1263 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1264 // Fake memory access type for no return sampler intrinsics
1265 Info.memVT = MVT::i32;
1266 } else {
1267 // XXX - Should this be volatile without known ordering?
1269 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1270 }
1271 break;
1272 case Intrinsic::amdgcn_raw_buffer_load_lds:
1273 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_buffer_load_lds:
1275 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1276 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1277 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1278 Info.ptrVal = CI.getArgOperand(1);
1279 return true;
1280 }
1281 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1282 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1283 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1284 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1285 Info.memVT =
1287 std::numeric_limits<unsigned>::max());
1288 Info.flags &= ~MachineMemOperand::MOStore;
1289 return true;
1290 }
1291 }
1292 }
1293 return true;
1294 }
1295
1296 switch (IntrID) {
1297 case Intrinsic::amdgcn_ds_ordered_add:
1298 case Intrinsic::amdgcn_ds_ordered_swap: {
1300 Info.memVT = MVT::getVT(CI.getType());
1301 Info.ptrVal = CI.getOperand(0);
1302 Info.align.reset();
1304
1305 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1306 if (!Vol->isZero())
1308
1309 return true;
1310 }
1311 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1312 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1314 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1315 Info.ptrVal = nullptr;
1316 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1318 return true;
1319 }
1320 case Intrinsic::amdgcn_ds_append:
1321 case Intrinsic::amdgcn_ds_consume: {
1323 Info.memVT = MVT::getVT(CI.getType());
1324 Info.ptrVal = CI.getOperand(0);
1325 Info.align.reset();
1327
1328 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1329 if (!Vol->isZero())
1331
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_csub: {
1336 Info.memVT = MVT::getVT(CI.getType());
1337 Info.ptrVal = CI.getOperand(0);
1338 Info.align.reset();
1342 return true;
1343 }
1344 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1346 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1347
1348 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1349 Info.align.reset();
1352 return true;
1353 }
1354 case Intrinsic::amdgcn_global_atomic_fadd:
1355 case Intrinsic::amdgcn_global_atomic_fmin:
1356 case Intrinsic::amdgcn_global_atomic_fmax:
1357 case Intrinsic::amdgcn_global_atomic_fmin_num:
1358 case Intrinsic::amdgcn_global_atomic_fmax_num:
1359 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1360 case Intrinsic::amdgcn_flat_atomic_fadd:
1361 case Intrinsic::amdgcn_flat_atomic_fmin:
1362 case Intrinsic::amdgcn_flat_atomic_fmax:
1363 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1364 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1365 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1366 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1367 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1369 Info.memVT = MVT::getVT(CI.getType());
1370 Info.ptrVal = CI.getOperand(0);
1371 Info.align.reset();
1376 return true;
1377 }
1378 case Intrinsic::amdgcn_global_load_tr_b64:
1379 case Intrinsic::amdgcn_global_load_tr_b128: {
1381 Info.memVT = MVT::getVT(CI.getType());
1382 Info.ptrVal = CI.getOperand(0);
1383 Info.align.reset();
1385 return true;
1386 }
1387 case Intrinsic::amdgcn_ds_gws_init:
1388 case Intrinsic::amdgcn_ds_gws_barrier:
1389 case Intrinsic::amdgcn_ds_gws_sema_v:
1390 case Intrinsic::amdgcn_ds_gws_sema_br:
1391 case Intrinsic::amdgcn_ds_gws_sema_p:
1392 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1394
1395 const GCNTargetMachine &TM =
1396 static_cast<const GCNTargetMachine &>(getTargetMachine());
1397
1399 Info.ptrVal = MFI->getGWSPSV(TM);
1400
1401 // This is an abstract access, but we need to specify a type and size.
1402 Info.memVT = MVT::i32;
1403 Info.size = 4;
1404 Info.align = Align(4);
1405
1406 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1408 else
1410 return true;
1411 }
1412 case Intrinsic::amdgcn_global_load_lds: {
1414 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1415 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1416 Info.ptrVal = CI.getArgOperand(1);
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1422
1423 const GCNTargetMachine &TM =
1424 static_cast<const GCNTargetMachine &>(getTargetMachine());
1425
1427 Info.ptrVal = MFI->getGWSPSV(TM);
1428
1429 // This is an abstract access, but we need to specify a type and size.
1430 Info.memVT = MVT::i32;
1431 Info.size = 4;
1432 Info.align = Align(4);
1433
1435 return true;
1436 }
1437 default:
1438 return false;
1439 }
1440}
1441
1443 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1444 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1445 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1446 // The DAG's ValueType loses the addrspaces.
1447 // Add them as 2 extra Constant operands "from" and "to".
1448 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1449 unsigned DstAS = I.getType()->getPointerAddressSpace();
1450 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1451 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1452 break;
1453 }
1454 default:
1455 break;
1456 }
1457}
1458
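// Report the pointer operand of target memory intrinsics so that
// addressing-mode-driven transforms (e.g. loop strength reduction) can fold
// address arithmetic into them.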
1461 Type *&AccessTy) const {
1462 Value *Ptr = nullptr;
1463 switch (II->getIntrinsicID()) {
1464 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1465 case Intrinsic::amdgcn_ds_append:
1466 case Intrinsic::amdgcn_ds_consume:
1467 case Intrinsic::amdgcn_ds_ordered_add:
1468 case Intrinsic::amdgcn_ds_ordered_swap:
1469 case Intrinsic::amdgcn_flat_atomic_fadd:
1470 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1471 case Intrinsic::amdgcn_flat_atomic_fmax:
1472 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1473 case Intrinsic::amdgcn_flat_atomic_fmin:
1474 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1475 case Intrinsic::amdgcn_global_atomic_csub:
1476 case Intrinsic::amdgcn_global_atomic_fadd:
1477 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1478 case Intrinsic::amdgcn_global_atomic_fmax:
1479 case Intrinsic::amdgcn_global_atomic_fmax_num:
1480 case Intrinsic::amdgcn_global_atomic_fmin:
1481 case Intrinsic::amdgcn_global_atomic_fmin_num:
1482 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1483 case Intrinsic::amdgcn_global_load_tr_b64:
1484 case Intrinsic::amdgcn_global_load_tr_b128:
1485 Ptr = II->getArgOperand(0);
1486 break;
1487 case Intrinsic::amdgcn_global_load_lds:
1488 Ptr = II->getArgOperand(1);
1489 break;
1490 default:
1491 return false;
1492 }
1493 AccessTy = II->getType();
1494 Ops.push_back(Ptr);
1495 return true;
1496}
1497
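// A FLAT addressing mode is legal only as a bare register address on targets
// without flat instruction offsets; otherwise register + immediate is also
// accepted when the immediate fits the encoding's offset field. Scaled
// indices are never legal here.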
1499 unsigned AddrSpace) const {
1500 if (!Subtarget->hasFlatInstOffsets()) {
1501 // Flat instructions do not have offsets, and only have the register
1502 // address.
1503 return AM.BaseOffs == 0 && AM.Scale == 0;
1504 }
1505
1506 decltype(SIInstrFlags::FLAT) FlatVariant =
1510
1511 return AM.Scale == 0 &&
1512 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1513 AM.BaseOffs, AddrSpace, FlatVariant));
1514}
1515
1517 if (Subtarget->hasFlatGlobalInsts())
1519
1520 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1521 // Assume that we will use FLAT for all global memory accesses
1522 // on VI.
1523 // FIXME: This assumption is currently wrong. On VI we still use
1524 // MUBUF instructions for the r + i addressing mode. As currently
1525 // implemented, the MUBUF instructions only work on buffer < 4GB.
1526 // It may be possible to support > 4GB buffers with MUBUF instructions,
1527 // by setting the stride value in the resource descriptor which would
1528 // increase the size limit to (stride * 4GB). However, this is risky,
1529 // because it has never been validated.
1531 }
1532
1533 return isLegalMUBUFAddressingMode(AM);
1534}
1535
1536bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1537 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1538 // additionally can do r + r + i with addr64. 32-bit has more addressing
1539 // mode options. Depending on the resource constant, it can also do
1540 // (i64 r0) + (i32 r1) * (i14 i).
1541 //
1542 // Private arrays end up using a scratch buffer most of the time, so also
1543 // assume those use MUBUF instructions. Scratch loads / stores are currently
1544 // implemented as mubuf instructions with offen bit set, so slightly
1545 // different than the normal addr64.
1546 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1547 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1548 return false;
1549
1550 // FIXME: Since we can split immediate into soffset and immediate offset,
1551 // would it make sense to allow any immediate?
1552
1553 switch (AM.Scale) {
1554 case 0: // r + i or just i, depending on HasBaseReg.
1555 return true;
1556 case 1:
1557 return true; // We have r + r or r + i.
1558 case 2:
1559 if (AM.HasBaseReg) {
1560 // Reject 2 * r + r.
1561 return false;
1562 }
1563
1564 // Allow 2 * r as r + r
1565 // Or 2 * r + i is allowed as r + r + i.
1566 return true;
1567 default: // Don't allow n * r
1568 return false;
1569 }
1570}
1571
1573 const AddrMode &AM, Type *Ty,
1574 unsigned AS, Instruction *I) const {
1575 // No global is ever allowed as a base.
1576 if (AM.BaseGV)
1577 return false;
1578
1579 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1580 return isLegalGlobalAddressingMode(AM);
1581
1582 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1586 // If the offset isn't a multiple of 4, it probably isn't going to be
1587 // correctly aligned.
1588 // FIXME: Can we get the real alignment here?
1589 if (AM.BaseOffs % 4 != 0)
1590 return isLegalMUBUFAddressingMode(AM);
1591
1592 if (!Subtarget->hasScalarSubwordLoads()) {
1593 // There are no SMRD extloads, so if we have to do a small type access we
1594 // will use a MUBUF load.
1595 // FIXME?: We also need to do this if unaligned, but we don't know the
1596 // alignment here.
1597 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1598 return isLegalGlobalAddressingMode(AM);
1599 }
1600
1602 // SMRD instructions have an 8-bit, dword offset on SI.
1603 if (!isUInt<8>(AM.BaseOffs / 4))
1604 return false;
1605 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1606 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1607 // in 8-bits, it can use a smaller encoding.
1608 if (!isUInt<32>(AM.BaseOffs / 4))
1609 return false;
1610 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1611 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1612 if (!isUInt<20>(AM.BaseOffs))
1613 return false;
1614 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1615 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1616 // for S_BUFFER_* instructions).
1617 if (!isInt<21>(AM.BaseOffs))
1618 return false;
1619 } else {
1620 // On GFX12, all offsets are signed 24-bit in bytes.
1621 if (!isInt<24>(AM.BaseOffs))
1622 return false;
1623 }
1624
1625 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1627 AM.BaseOffs < 0) {
1628 // Scalar (non-buffer) loads can only use a negative offset if
1629 // soffset+offset is non-negative. Since the compiler can only prove that
1630 // in a few special cases, it is safer to claim that negative offsets are
1631 // not supported.
1632 return false;
1633 }
1634
1635 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1636 return true;
1637
1638 if (AM.Scale == 1 && AM.HasBaseReg)
1639 return true;
1640
1641 return false;
1642 }
1643
1644 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1645 return Subtarget->enableFlatScratch()
1647 : isLegalMUBUFAddressingMode(AM);
1648
1649 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1650 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1651 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1652 // field.
1653 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1654 // an 8-bit dword offset but we don't know the alignment here.
1655 if (!isUInt<16>(AM.BaseOffs))
1656 return false;
1657
1658 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1659 return true;
1660
1661 if (AM.Scale == 1 && AM.HasBaseReg)
1662 return true;
1663
1664 return false;
1665 }
1666
1668 // For an unknown address space, this usually means that this is for some
1669 // reason being used for pure arithmetic, and not based on some addressing
1670 // computation. We don't have instructions that compute pointers with any
1671 // addressing modes, so treat them as having no offset like flat
1672 // instructions.
1674 }
1675
1676 // Assume a user alias of global for unknown address spaces.
1677 return isLegalGlobalAddressingMode(AM);
1678}
1679
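// Cap the width of merged stores by address space: up to 4 dwords in the
// widest case, up to the subtarget's maximum private element size for
// scratch, and up to 2 dwords otherwise.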
1681 const MachineFunction &MF) const {
1683 return (MemVT.getSizeInBits() <= 4 * 32);
1684 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1685 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1686 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1687 }
1689 return (MemVT.getSizeInBits() <= 2 * 32);
1690 return true;
1691}
1692
1694 unsigned Size, unsigned AddrSpace, Align Alignment,
1695 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1696 if (IsFast)
1697 *IsFast = 0;
1698
1699 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1700 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1701 // Check if alignment requirements for ds_read/write instructions are
1702 // disabled.
1703 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1704 return false;
1705
1706 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1707 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1708 Alignment < RequiredAlignment)
1709 return false;
1710
1711 // Either the alignment requirements are "enabled", or there is an
1712 // unaligned-LDS-access-related hardware bug even though alignment requirements
1713 // are "disabled". In either case, we need to check for proper alignment
1714 // requirements.
1715 //
1716 switch (Size) {
1717 case 64:
1718 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1719 // address is negative, then the instruction is incorrectly treated as
1720 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1721 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1722 // load later in the SILoadStoreOptimizer.
1723 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1724 return false;
1725
1726 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1727 // can do a 4-byte aligned, 8-byte access in a single operation using
1728 // ds_read2/write2_b32 with adjacent offsets.
1729 RequiredAlignment = Align(4);
1730
1731 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1732 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1733 // ds_write2_b32 depending on the alignment. In either case with either
1734 // alignment there is no faster way of doing this.
1735
1736 // The numbers returned here and below are not additive; they form a 'speed
1737 // rank'. They are just meant to be compared to decide if a certain way
1738 // of lowering an operation is faster than another. For that purpose a
1739 // naturally aligned operation gets its bitsize to indicate that "it
1740 // operates with a speed comparable to an N-bit wide load". With full
1741 // alignment ds128 is slower than ds96, for example. If underaligned it
1742 // is comparable to the speed of a single dword access, which would then
1743 // mean 32 < 128 and it is faster to issue a wide load regardless.
1744 // 1 is simply "slow, don't do it", i.e. when comparing an aligned load to a
1745 // wider load which will no longer be aligned, the latter is slower.
1746 if (IsFast)
1747 *IsFast = (Alignment >= RequiredAlignment) ? 64
1748 : (Alignment < Align(4)) ? 32
1749 : 1;
1750 return true;
1751 }
1752
1753 break;
1754 case 96:
1755 if (!Subtarget->hasDS96AndDS128())
1756 return false;
1757
1758 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1759 // gfx8 and older.
1760
1761 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1762 // Naturally aligned access is fastest. However, also report it is Fast
1763 // if memory is aligned less than DWORD. A narrow load or store will be
1764 // as slow as a single ds_read_b96/ds_write_b96, but there will
1765 // be more of them, so overall we will pay less penalty issuing a single
1766 // instruction.
1767
1768 // See comment on the values above.
1769 if (IsFast)
1770 *IsFast = (Alignment >= RequiredAlignment) ? 96
1771 : (Alignment < Align(4)) ? 32
1772 : 1;
1773 return true;
1774 }
1775
1776 break;
1777 case 128:
1778 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1779 return false;
1780
1781 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1782 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1783 // single operation using ds_read2/write2_b64.
1784 RequiredAlignment = Align(8);
1785
1786 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1787 // Naturally aligned access is fastest. However, also report it is Fast
1788 // if memory is aligned less than DWORD. A narrow load or store will be
1789 // as slow as a single ds_read_b128/ds_write_b128, but there
1790 // will be more of them, so overall we will pay less penalty issuing a
1791 // single instruction.
1792
1793 // See comment on the values above.
1794 if (IsFast)
1795 *IsFast = (Alignment >= RequiredAlignment) ? 128
1796 : (Alignment < Align(4)) ? 32
1797 : 1;
1798 return true;
1799 }
1800
1801 break;
1802 default:
1803 if (Size > 32)
1804 return false;
1805
1806 break;
1807 }
1808
1809 // See comment on the values above.
1810 // Note that we have a single-dword or sub-dword access here, so if it is
1811 // underaligned it is the slowest possible access, hence the returned value is 0.
1812 if (IsFast)
1813 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1814
1815 return Alignment >= RequiredAlignment ||
1816 Subtarget->hasUnalignedDSAccessEnabled();
1817 }
1818
1819 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1820 bool AlignedBy4 = Alignment >= Align(4);
1821 if (IsFast)
1822 *IsFast = AlignedBy4;
1823
1824 return AlignedBy4 ||
1825 Subtarget->enableFlatScratch() ||
1826 Subtarget->hasUnalignedScratchAccess();
1827 }
1828
1829 // FIXME: We have to be conservative here and assume that flat operations
1830 // will access scratch. If we had access to the IR function, then we
1831 // could determine if any private memory was used in the function.
1832 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1833 !Subtarget->hasUnalignedScratchAccess()) {
1834 bool AlignedBy4 = Alignment >= Align(4);
1835 if (IsFast)
1836 *IsFast = AlignedBy4;
1837
1838 return AlignedBy4;
1839 }
1840
1841 // So long as they are correct, wide global memory operations perform better
1842 // than multiple smaller memory ops -- even when misaligned
1843 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1844 if (IsFast)
1845 *IsFast = Size;
1846
1847 return Alignment >= Align(4) ||
1849 }
1850
1851 // Smaller-than-dword values must be aligned.
1852 if (Size < 32)
1853 return false;
1854
1855 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1856 // byte-address are ignored, thus forcing Dword alignment.
1857 // This applies to private, global, and constant memory.
1858 if (IsFast)
1859 *IsFast = 1;
1860
1861 return Size >= 32 && Alignment >= Align(4);
1862}
1863
1865 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1866 unsigned *IsFast) const {
1868 Alignment, Flags, IsFast);
1869}
1870
1872 const MemOp &Op, const AttributeList &FuncAttributes) const {
1873 // FIXME: Should account for address space here.
1874
1875 // The default fallback uses the private pointer size as a guess for a type to
1876 // use. Make sure we switch these to 64-bit accesses.
1877
1878 if (Op.size() >= 16 &&
1879 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1880 return MVT::v4i32;
1881
1882 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1883 return MVT::v2i32;
1884
1885 // Use the default.
1886 return MVT::Other;
1887}
1888
1890 const MemSDNode *MemNode = cast<MemSDNode>(N);
1891 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1892}
1893
1895 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1897}
1898
1900 unsigned DestAS) const {
1901 // Flat -> private/local is a simple truncate.
1902 // Flat -> global is no-op
1903 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1904 return true;
1905
1906 const GCNTargetMachine &TM =
1907 static_cast<const GCNTargetMachine &>(getTargetMachine());
1908 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1909}
1910
1912 const MemSDNode *MemNode = cast<MemSDNode>(N);
1913
1915}
1916
1919 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1920 VT.getScalarType().bitsLE(MVT::i16))
1923}
1924
1926 Type *Ty) const {
1927 // FIXME: Could be smarter if called for vector constants.
1928 return true;
1929}
1930
1932 unsigned Index) const {
1934 return false;
1935
1936 // TODO: Add more cases that are cheap.
1937 return Index == 0;
1938}
1939
1941 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1942 switch (Op) {
1943 case ISD::LOAD:
1944 case ISD::STORE:
1945
1946 // These operations are done with 32-bit instructions anyway.
1947 case ISD::AND:
1948 case ISD::OR:
1949 case ISD::XOR:
1950 case ISD::SELECT:
1951 // TODO: Extensions?
1952 return true;
1953 default:
1954 return false;
1955 }
1956 }
1957
1958 // SimplifySetCC uses this function to determine whether or not it should
1959 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1960 if (VT == MVT::i1 && Op == ISD::SETCC)
1961 return false;
1962
1964}
1965
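// Build a pointer into the kernarg segment at the given byte offset. If no
// kernarg segment pointer is live in (a kernel with no arguments), the offset
// itself is returned as a constant pointer.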
1966SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1967 const SDLoc &SL,
1968 SDValue Chain,
1969 uint64_t Offset) const {
1970 const DataLayout &DL = DAG.getDataLayout();
1973
1974 const ArgDescriptor *InputPtrReg;
1975 const TargetRegisterClass *RC;
1976 LLT ArgTy;
1978
1979 std::tie(InputPtrReg, RC, ArgTy) =
1981
1982 // We may not have the kernarg segment argument if we have no kernel
1983 // arguments.
1984 if (!InputPtrReg)
1985 return DAG.getConstant(Offset, SL, PtrVT);
1986
1988 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1989 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1990
1991 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1992}
1993
1994SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1995 const SDLoc &SL) const {
1998 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1999}
2000
2001SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2002 const SDLoc &SL) const {
2003
2005 std::optional<uint32_t> KnownSize =
2007 if (KnownSize.has_value())
2008 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2009 return SDValue();
2010}
2011
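// Convert a value loaded as MemVT into the expected argument type VT:
// narrow widened vectors, honor the argument's sign/zero-extension flags, and
// use FP extension/rounding for floating-point types.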
2012SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2013 const SDLoc &SL, SDValue Val,
2014 bool Signed,
2015 const ISD::InputArg *Arg) const {
2016 // First, if it is a widened vector, narrow it.
2017 if (VT.isVector() &&
2019 EVT NarrowedVT =
2022 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2023 DAG.getConstant(0, SL, MVT::i32));
2024 }
2025
2026 // Then convert the vector elements or scalar value.
2027 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2028 VT.bitsLT(MemVT)) {
2029 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2030 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2031 }
2032
2033 if (MemVT.isFloatingPoint())
2034 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2035 else if (Signed)
2036 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2037 else
2038 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2039
2040 return Val;
2041}
2042
2043SDValue SITargetLowering::lowerKernargMemParameter(
2044 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2045 uint64_t Offset, Align Alignment, bool Signed,
2046 const ISD::InputArg *Arg) const {
2048
2049 // Try to avoid using an extload by loading earlier than the argument address,
2050 // and extracting the relevant bits. The load should hopefully be merged with
2051 // the previous argument.
2052 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2053 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2054 int64_t AlignDownOffset = alignDown(Offset, 4);
2055 int64_t OffsetDiff = Offset - AlignDownOffset;
2056
2057 EVT IntVT = MemVT.changeTypeToInteger();
2058
2059 // TODO: If we passed in the base kernel offset we could have a better
2060 // alignment than 4, but we don't really need it.
2061 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2062 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2065
2066 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2067 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2068
2069 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2070 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2071 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2072
2073
2074 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2075 }
2076
2077 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2078 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2081
2082 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2083 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2084}
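// Worked example (editorial, hypothetical offsets): an i16 kernel argument at
// byte offset 2 with 2-byte alignment takes the "load earlier" path above:
//
//   AlignDownOffset = alignDown(2, 4) = 0;  // load the dword at offset 0
//   OffsetDiff      = 2 - 0 = 2;            // the argument lives in bits 16..31
//   Extract         = (Load >> (2 * 8));    // shift the halfword down
//   ArgVal          = trunc i32 Extract to i16;
//
// The dword load at offset 0 can then be merged with whatever argument
// occupies bytes 0..1.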
2085
2086SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2087 const SDLoc &SL, SDValue Chain,
2088 const ISD::InputArg &Arg) const {
2090 MachineFrameInfo &MFI = MF.getFrameInfo();
2091
2092 if (Arg.Flags.isByVal()) {
2093 unsigned Size = Arg.Flags.getByValSize();
2094 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2095 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2096 }
2097
2098 unsigned ArgOffset = VA.getLocMemOffset();
2099 unsigned ArgSize = VA.getValVT().getStoreSize();
2100
2101 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2102
2103 // Create load nodes to retrieve arguments from the stack.
2104 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2105 SDValue ArgValue;
2106
2107 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2109 MVT MemVT = VA.getValVT();
2110
2111 switch (VA.getLocInfo()) {
2112 default:
2113 break;
2114 case CCValAssign::BCvt:
2115 MemVT = VA.getLocVT();
2116 break;
2117 case CCValAssign::SExt:
2118 ExtType = ISD::SEXTLOAD;
2119 break;
2120 case CCValAssign::ZExt:
2121 ExtType = ISD::ZEXTLOAD;
2122 break;
2123 case CCValAssign::AExt:
2124 ExtType = ISD::EXTLOAD;
2125 break;
2126 }
2127
2128 ArgValue = DAG.getExtLoad(
2129 ExtType, SL, VA.getLocVT(), Chain, FIN,
2131 MemVT);
2132 return ArgValue;
2133}
2134
2135SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2136 const SIMachineFunctionInfo &MFI,
2137 EVT VT,
2139 const ArgDescriptor *Reg = nullptr;
2140 const TargetRegisterClass *RC;
2141 LLT Ty;
2142
2144 const ArgDescriptor WorkGroupIDX =
2145 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2146 // If GridZ is not programmed in an entry function then the hardware will set
2147 // it to all zeros, so there is no need to mask the GridY value in the low
2148 // order bits.
2149 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2150 AMDGPU::TTMP7,
2151 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2152 const ArgDescriptor WorkGroupIDZ =
2153 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
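  // Layout sketch (editorial illustration): with architected SGPRs the
  // workgroup IDs come from trap temporaries instead of user SGPRs:
  //
  //   TTMP9        -> workgroup ID X
  //   TTMP7[15:0]  -> workgroup ID Y  (mask 0x0000FFFF)
  //   TTMP7[31:16] -> workgroup ID Z  (mask 0xFFFF0000)
  //
  // When GridZ is never programmed the hardware leaves the high half of TTMP7
  // zeroed, which is why the Y descriptor above may use the full ~0u mask.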
2154 if (Subtarget->hasArchitectedSGPRs() &&
2156 switch (PVID) {
2158 Reg = &WorkGroupIDX;
2159 RC = &AMDGPU::SReg_32RegClass;
2160 Ty = LLT::scalar(32);
2161 break;
2163 Reg = &WorkGroupIDY;
2164 RC = &AMDGPU::SReg_32RegClass;
2165 Ty = LLT::scalar(32);
2166 break;
2168 Reg = &WorkGroupIDZ;
2169 RC = &AMDGPU::SReg_32RegClass;
2170 Ty = LLT::scalar(32);
2171 break;
2172 default:
2173 break;
2174 }
2175 }
2176
2177 if (!Reg)
2178 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2179 if (!Reg) {
2181 // It's possible for a kernarg intrinsic call to appear in a kernel with
2182 // no allocated segment, in which case we do not add the user sgpr
2183 // argument, so just return null.
2184 return DAG.getConstant(0, SDLoc(), VT);
2185 }
2186
2187 // It's undefined behavior if a function marked with the amdgpu-no-*
2188 // attributes uses the corresponding intrinsic.
2189 return DAG.getUNDEF(VT);
2190 }
2191
2192 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2193}
2194
2196 CallingConv::ID CallConv,
2197 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2198 FunctionType *FType,
2199 SIMachineFunctionInfo *Info) {
2200 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2201 const ISD::InputArg *Arg = &Ins[I];
2202
2203 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2204 "vector type argument should have been split");
2205
2206 // First check if it's a PS input addr.
2207 if (CallConv == CallingConv::AMDGPU_PS &&
2208 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2209 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2210
2211 // Inconveniently only the first part of the split is marked as isSplit,
2212 // so skip to the end. We only want to increment PSInputNum once for the
2213 // entire split argument.
2214 if (Arg->Flags.isSplit()) {
2215 while (!Arg->Flags.isSplitEnd()) {
2216 assert((!Arg->VT.isVector() ||
2217 Arg->VT.getScalarSizeInBits() == 16) &&
2218 "unexpected vector split in ps argument type");
2219 if (!SkipArg)
2220 Splits.push_back(*Arg);
2221 Arg = &Ins[++I];
2222 }
2223 }
2224
2225 if (SkipArg) {
2226 // We can safely skip PS inputs.
2227 Skipped.set(Arg->getOrigArgIndex());
2228 ++PSInputNum;
2229 continue;
2230 }
2231
2232 Info->markPSInputAllocated(PSInputNum);
2233 if (Arg->Used)
2234 Info->markPSInputEnabled(PSInputNum);
2235
2236 ++PSInputNum;
2237 }
2238
2239 Splits.push_back(*Arg);
2240 }
2241}
2242
2243// Allocate special inputs passed in VGPRs.
2245 MachineFunction &MF,
2246 const SIRegisterInfo &TRI,
2247 SIMachineFunctionInfo &Info) const {
2248 const LLT S32 = LLT::scalar(32);
2250
2251 if (Info.hasWorkItemIDX()) {
2252 Register Reg = AMDGPU::VGPR0;
2253 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2254
2255 CCInfo.AllocateReg(Reg);
2256 unsigned Mask = (Subtarget->hasPackedTID() &&
2257 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2258 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2259 }
2260
2261 if (Info.hasWorkItemIDY()) {
2262 assert(Info.hasWorkItemIDX());
2263 if (Subtarget->hasPackedTID()) {
2264 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2265 0x3ff << 10));
2266 } else {
2267 unsigned Reg = AMDGPU::VGPR1;
2268 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2269
2270 CCInfo.AllocateReg(Reg);
2271 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2272 }
2273 }
2274
2275 if (Info.hasWorkItemIDZ()) {
2276 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2277 if (Subtarget->hasPackedTID()) {
2278 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2279 0x3ff << 20));
2280 } else {
2281 unsigned Reg = AMDGPU::VGPR2;
2282 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283
2284 CCInfo.AllocateReg(Reg);
2285 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2286 }
2287 }
2288}
2289
2290 // Try to allocate a VGPR at the end of the argument list, or if no argument
2291 // VGPRs are left, allocate a stack slot.
2292 // If \p Mask is given, it indicates the bitfield position in the register.
2293 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2294static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2295 ArgDescriptor Arg = ArgDescriptor()) {
2296 if (Arg.isSet())
2297 return ArgDescriptor::createArg(Arg, Mask);
2298
2299 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2300 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2301 if (RegIdx == ArgVGPRs.size()) {
2302 // Spill to stack required.
2303 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2304
2305 return ArgDescriptor::createStack(Offset, Mask);
2306 }
2307
2308 unsigned Reg = ArgVGPRs[RegIdx];
2309 Reg = CCInfo.AllocateReg(Reg);
2310 assert(Reg != AMDGPU::NoRegister);
2311
2312 MachineFunction &MF = CCInfo.getMachineFunction();
2313 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2314 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2315 return ArgDescriptor::createRegister(Reg, Mask);
2316}
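// Usage sketch (editorial): the callers below that need all three workitem IDs
// reuse a single VGPR by passing the previous descriptor back in with a
// shifted mask:
//
//   ArgDescriptor A = allocateVGPR32Input(CCInfo, 0x3ff);      // X in [9:0]
//   A = allocateVGPR32Input(CCInfo, 0x3ff << 10, A);           // Y in [19:10]
//   A = allocateVGPR32Input(CCInfo, 0x3ff << 20, A);           // Z in [29:20]
//
// Only the first call allocates a register or stack slot; the later calls
// rebind the same location with a different bitfield mask.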
2317
2319 const TargetRegisterClass *RC,
2320 unsigned NumArgRegs) {
2321 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2322 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2323 if (RegIdx == ArgSGPRs.size())
2324 report_fatal_error("ran out of SGPRs for arguments");
2325
2326 unsigned Reg = ArgSGPRs[RegIdx];
2327 Reg = CCInfo.AllocateReg(Reg);
2328 assert(Reg != AMDGPU::NoRegister);
2329
2330 MachineFunction &MF = CCInfo.getMachineFunction();
2331 MF.addLiveIn(Reg, RC);
2333}
2334
2335// If this has a fixed position, we still should allocate the register in the
2336// CCInfo state. Technically we could get away with this for values passed
2337// outside of the normal argument range.
2339 const TargetRegisterClass *RC,
2340 MCRegister Reg) {
2341 Reg = CCInfo.AllocateReg(Reg);
2342 assert(Reg != AMDGPU::NoRegister);
2343 MachineFunction &MF = CCInfo.getMachineFunction();
2344 MF.addLiveIn(Reg, RC);
2345}
2346
2347static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2348 if (Arg) {
2349 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2350 Arg.getRegister());
2351 } else
2352 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2353}
2354
2355static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2356 if (Arg) {
2357 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2358 Arg.getRegister());
2359 } else
2360 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2361}
2362
2363/// Allocate implicit function VGPR arguments at the end of allocated user
2364/// arguments.
2366 CCState &CCInfo, MachineFunction &MF,
2367 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2368 const unsigned Mask = 0x3ff;
2369 ArgDescriptor Arg;
2370
2371 if (Info.hasWorkItemIDX()) {
2372 Arg = allocateVGPR32Input(CCInfo, Mask);
2373 Info.setWorkItemIDX(Arg);
2374 }
2375
2376 if (Info.hasWorkItemIDY()) {
2377 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2378 Info.setWorkItemIDY(Arg);
2379 }
2380
2381 if (Info.hasWorkItemIDZ())
2382 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2383}
2384
2385/// Allocate implicit function VGPR arguments in fixed registers.
2387 CCState &CCInfo, MachineFunction &MF,
2388 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2389 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2390 if (!Reg)
2391 report_fatal_error("failed to allocated VGPR for implicit arguments");
2392
2393 const unsigned Mask = 0x3ff;
2394 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2395 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2396 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2397}
2398
2400 CCState &CCInfo,
2401 MachineFunction &MF,
2402 const SIRegisterInfo &TRI,
2403 SIMachineFunctionInfo &Info) const {
2404 auto &ArgInfo = Info.getArgInfo();
2405 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2406
2407 // TODO: Unify handling with private memory pointers.
2408 if (UserSGPRInfo.hasDispatchPtr())
2409 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2410
2411 const Module *M = MF.getFunction().getParent();
2412 if (UserSGPRInfo.hasQueuePtr() &&
2414 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2415
2416 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2417 // constant offset from the kernarg segment.
2418 if (Info.hasImplicitArgPtr())
2419 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2420
2421 if (UserSGPRInfo.hasDispatchID())
2422 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2423
2424 // flat_scratch_init is not applicable for non-kernel functions.
2425
2426 if (Info.hasWorkGroupIDX())
2427 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2428
2429 if (Info.hasWorkGroupIDY())
2430 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2431
2432 if (Info.hasWorkGroupIDZ())
2433 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2434
2435 if (Info.hasLDSKernelId())
2436 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2437}
2438
2439// Allocate special inputs passed in user SGPRs.
2441 MachineFunction &MF,
2442 const SIRegisterInfo &TRI,
2443 SIMachineFunctionInfo &Info) const {
2444 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2445 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2446 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2447 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2448 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2449 }
2450
2451 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2452 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2453 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2454 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2455 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2456 }
2457
2458 if (UserSGPRInfo.hasDispatchPtr()) {
2459 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2460 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2461 CCInfo.AllocateReg(DispatchPtrReg);
2462 }
2463
2464 const Module *M = MF.getFunction().getParent();
2465 if (UserSGPRInfo.hasQueuePtr() &&
2467 Register QueuePtrReg = Info.addQueuePtr(TRI);
2468 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2469 CCInfo.AllocateReg(QueuePtrReg);
2470 }
2471
2472 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2474 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2475 CCInfo.AllocateReg(InputPtrReg);
2476
2477 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2478 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2479 }
2480
2481 if (UserSGPRInfo.hasDispatchID()) {
2482 Register DispatchIDReg = Info.addDispatchID(TRI);
2483 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2484 CCInfo.AllocateReg(DispatchIDReg);
2485 }
2486
2487 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2488 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2489 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2490 CCInfo.AllocateReg(FlatScratchInitReg);
2491 }
2492
2493 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2494 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2495 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2496 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2497 }
2498
2499 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2500 // these from the dispatch pointer.
2501}
2502
2503 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2504 // sequential, starting from the first argument.
2506 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2508 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2509 Function &F = MF.getFunction();
2510 unsigned LastExplicitArgOffset =
2511 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2512 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2513 bool InPreloadSequence = true;
2514 unsigned InIdx = 0;
2515 for (auto &Arg : F.args()) {
2516 if (!InPreloadSequence || !Arg.hasInRegAttr())
2517 break;
2518
2519 int ArgIdx = Arg.getArgNo();
2520 // Don't preload non-original args or parts not in the current preload
2521 // sequence.
2522 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2523 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2524 break;
2525
2526 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2527 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2528 InIdx++) {
2529 assert(ArgLocs[ArgIdx].isMemLoc());
2530 auto &ArgLoc = ArgLocs[InIdx];
2531 const Align KernelArgBaseAlign = Align(16);
2532 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2533 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2534 unsigned NumAllocSGPRs =
2535 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2536
2537 // Arg is preloaded into the previous SGPR.
2538 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2539 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2540 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2541 continue;
2542 }
2543
2544 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2545 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2546 // Check for free user SGPRs for preloading.
2547 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2548 SGPRInfo.getNumFreeUserSGPRs()) {
2549 InPreloadSequence = false;
2550 break;
2551 }
2552
2553 // Preload this argument.
2554 const TargetRegisterClass *RC =
2555 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2556 SmallVectorImpl<MCRegister> *PreloadRegs =
2557 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2558
2559 if (PreloadRegs->size() > 1)
2560 RC = &AMDGPU::SGPR_32RegClass;
2561 for (auto &Reg : *PreloadRegs) {
2562 assert(Reg);
2563 MF.addLiveIn(Reg, RC);
2564 CCInfo.AllocateReg(Reg);
2565 }
2566
2567 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2568 }
2569 }
2570}
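// Worked example (editorial, hypothetical layout): preloading an i64 argument
// at kernarg offset 16 when the previous argument ended at offset 8:
//
//   Padding       = 16 - 8 = 8 bytes
//   PaddingSGPRs  = alignTo(8, 4) / 4   = 2 (dead SGPRs that keep offsets in sync)
//   NumAllocSGPRs = alignTo(64, 32) / 32 = 2 (SGPRs holding the value itself)
//
// Preloading continues only if 2 + 2 + 1 (synthetic) free user SGPRs remain;
// otherwise the sequence stops and later arguments are loaded from memory.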
2571
2573 const SIRegisterInfo &TRI,
2574 SIMachineFunctionInfo &Info) const {
2575 // Always allocate this last since it is a synthetic preload.
2576 if (Info.hasLDSKernelId()) {
2577 Register Reg = Info.addLDSKernelId();
2578 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2579 CCInfo.AllocateReg(Reg);
2580 }
2581}
2582
2583// Allocate special input registers that are initialized per-wave.
2585 MachineFunction &MF,
2587 CallingConv::ID CallConv,
2588 bool IsShader) const {
2589 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2590 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2591 // Note: user SGPRs are handled by the front-end for graphics shaders
2592 // Pad up the used user SGPRs with dead inputs.
2593
2594 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2595 // before enabling architected SGPRs for workgroup IDs.
2596 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2597
2598 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2599 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2600 // rely on it to reach 16 since if we end up having no stack usage, it will
2601 // not really be added.
2602 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2603 Info.hasWorkGroupIDY() +
2604 Info.hasWorkGroupIDZ() +
2605 Info.hasWorkGroupInfo();
2606 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 Register Reg = Info.addReservedUserSGPR();
2608 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2609 CCInfo.AllocateReg(Reg);
2610 }
2611 }
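  // Worked example (editorial): with the user-SGPR-init-16 bug, a kernel that
  // already has 6 user SGPRs and needs workgroup IDs X and Y plus the
  // workgroup info (3 system SGPRs) still reserves 16 - (6 + 3) = 7 dead
  // inputs so the hardware initializes a full 16 SGPRs.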
2612
2613 if (!HasArchitectedSGPRs) {
2614 if (Info.hasWorkGroupIDX()) {
2615 Register Reg = Info.addWorkGroupIDX();
2616 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2617 CCInfo.AllocateReg(Reg);
2618 }
2619
2620 if (Info.hasWorkGroupIDY()) {
2621 Register Reg = Info.addWorkGroupIDY();
2622 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(Reg);
2624 }
2625
2626 if (Info.hasWorkGroupIDZ()) {
2627 Register Reg = Info.addWorkGroupIDZ();
2628 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(Reg);
2630 }
2631 }
2632
2633 if (Info.hasWorkGroupInfo()) {
2634 Register Reg = Info.addWorkGroupInfo();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasPrivateSegmentWaveByteOffset()) {
2640 // Scratch wave offset passed in system SGPR.
2641 unsigned PrivateSegmentWaveByteOffsetReg;
2642
2643 if (IsShader) {
2644 PrivateSegmentWaveByteOffsetReg =
2645 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2646
2647 // This is true if the scratch wave byte offset doesn't have a fixed
2648 // location.
2649 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2651 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2652 }
2653 } else
2654 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2655
2656 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2657 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2658 }
2659
2660 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2661 Info.getNumPreloadedSGPRs() >= 16);
2662}
2663
2665 MachineFunction &MF,
2666 const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) {
2668 // Now that we've figured out where the scratch register inputs are, see if we
2669 // should reserve the arguments and use them directly.
2670 MachineFrameInfo &MFI = MF.getFrameInfo();
2671 bool HasStackObjects = MFI.hasStackObjects();
2672 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2673
2674 // Record that we know we have non-spill stack objects so we don't need to
2675 // check all stack objects later.
2676 if (HasStackObjects)
2677 Info.setHasNonSpillStackObjects(true);
2678
2679 // Everything live out of a block is spilled with fast regalloc, so it's
2680 // almost certain that spilling will be required.
2681 if (TM.getOptLevel() == CodeGenOptLevel::None)
2682 HasStackObjects = true;
2683
2684 // For now assume stack access is needed in any callee functions, so we need
2685 // the scratch registers to pass in.
2686 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2687
2688 if (!ST.enableFlatScratch()) {
2689 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2690 // If we have stack objects, we unquestionably need the private buffer
2691 // resource. For the Code Object V2 ABI, this will be the first 4 user
2692 // SGPR inputs. We can reserve those and use them directly.
2693
2694 Register PrivateSegmentBufferReg =
2696 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 } else {
2698 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2699 // We tentatively reserve the last available registers (skipping those that
2700 // may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2701 // we'll replace these with the ones immediately after those which were
2702 // really allocated. In the prologue copies will be inserted from the
2703 // argument to these reserved registers.
2704
2705 // Without HSA, relocations are used for the scratch pointer and the
2706 // buffer resource setup is always inserted in the prologue. Scratch wave
2707 // offset is still in an input SGPR.
2708 Info.setScratchRSrcReg(ReservedBufferReg);
2709 }
2710 }
2711
2713
2714 // For entry functions we have to set up the stack pointer if we use it,
2715 // whereas non-entry functions get this "for free". This means there is no
2716 // intrinsic advantage to using S32 over S34 in cases where we do not have
2717 // calls but do need a frame pointer (i.e. if we are requested to have one
2718 // because frame pointer elimination is disabled). To keep things simple we
2719 // only ever use S32 as the call ABI stack pointer, and so using it does not
2720 // imply we need a separate frame pointer.
2721 //
2722 // Try to use s32 as the SP, but move it if it would interfere with input
2723 // arguments. This won't work with calls though.
2724 //
2725 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2726 // registers.
2727 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2728 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2729 } else {
2731
2732 if (MFI.hasCalls())
2733 report_fatal_error("call in graphics shader with too many input SGPRs");
2734
2735 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2736 if (!MRI.isLiveIn(Reg)) {
2737 Info.setStackPtrOffsetReg(Reg);
2738 break;
2739 }
2740 }
2741
2742 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2743 report_fatal_error("failed to find register for SP");
2744 }
2745
2746 // hasFP should be accurate for entry functions even before the frame is
2747 // finalized, because it does not rely on the known stack size, only
2748 // properties like whether variable sized objects are present.
2749 if (ST.getFrameLowering()->hasFP(MF)) {
2750 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2751 }
2752}
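// Typical outcome for an HSA entry point with stack access (editorial note;
// the register choices follow the code above):
//
//   scratch rsrc -> the first four user SGPRs (private segment buffer)
//   SP           -> SGPR32, or the next SGPR that is not already a live-in
//   FP           -> SGPR33, only when frame lowering reports hasFP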
2753
2756 return !Info->isEntryFunction();
2757}
2758
2760
2761}
2762
2764 MachineBasicBlock *Entry,
2765 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2767
2768 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2769 if (!IStart)
2770 return;
2771
2772 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2773 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2774 MachineBasicBlock::iterator MBBI = Entry->begin();
2775 for (const MCPhysReg *I = IStart; *I; ++I) {
2776 const TargetRegisterClass *RC = nullptr;
2777 if (AMDGPU::SReg_64RegClass.contains(*I))
2778 RC = &AMDGPU::SGPR_64RegClass;
2779 else if (AMDGPU::SReg_32RegClass.contains(*I))
2780 RC = &AMDGPU::SGPR_32RegClass;
2781 else
2782 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2783
2784 Register NewVR = MRI->createVirtualRegister(RC);
2785 // Create copy from CSR to a virtual register.
2786 Entry->addLiveIn(*I);
2787 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2788 .addReg(*I);
2789
2790 // Insert the copy-back instructions right before the terminator.
2791 for (auto *Exit : Exits)
2792 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2793 TII->get(TargetOpcode::COPY), *I)
2794 .addReg(NewVR);
2795 }
2796}
2797
2799 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2800 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2801 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2803
2805 const Function &Fn = MF.getFunction();
2808
2809 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2810 DiagnosticInfoUnsupported NoGraphicsHSA(
2811 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2812 DAG.getContext()->diagnose(NoGraphicsHSA);
2813 return DAG.getEntryNode();
2814 }
2815
2818 BitVector Skipped(Ins.size());
2819 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2820 *DAG.getContext());
2821
2822 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2823 bool IsKernel = AMDGPU::isKernel(CallConv);
2824 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2825
2826 if (IsGraphics) {
2827 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2828 assert(!UserSGPRInfo.hasDispatchPtr() &&
2829 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2830 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2831 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2832 (void)UserSGPRInfo;
2833 if (!Subtarget->enableFlatScratch())
2834 assert(!UserSGPRInfo.hasFlatScratchInit());
2835 if ((CallConv != CallingConv::AMDGPU_CS &&
2836 CallConv != CallingConv::AMDGPU_Gfx) ||
2837 !Subtarget->hasArchitectedSGPRs())
2838 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2839 !Info->hasWorkGroupIDZ());
2840 }
2841
2842 if (CallConv == CallingConv::AMDGPU_PS) {
2843 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2844
2845 // At least one interpolation mode must be enabled or else the GPU will
2846 // hang.
2847 //
2848 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2849 // set PSInputAddr, the user wants to enable some bits after compilation
2850 // based on run-time states. Since we can't know what the final PSInputEna
2851 // will look like, we shouldn't do anything here, and the user should take
2852 // responsibility for the correct programming.
2853 //
2854 // Otherwise, the following restrictions apply:
2855 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2856 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2857 // enabled too.
2858 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2859 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2860 CCInfo.AllocateReg(AMDGPU::VGPR0);
2861 CCInfo.AllocateReg(AMDGPU::VGPR1);
2862 Info->markPSInputAllocated(0);
2863 Info->markPSInputEnabled(0);
2864 }
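    // Worked example (editorial, hypothetical mask): a pixel shader that only
    // reads POS_W (input 11) has (PSInputAddr & 0xF) == 0, so the check above
    // force-enables input 0 (the first PERSP mode) and reserves VGPR0/VGPR1
    // for it, satisfying the hardware requirement that at least one
    // interpolation mode be enabled.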
2865 if (Subtarget->isAmdPalOS()) {
2866 // For isAmdPalOS, the user does not enable some bits after compilation
2867 // based on run-time states; the register values being generated here are
2868 // the final ones set in hardware. Therefore we need to apply the
2869 // workaround to PSInputAddr and PSInputEnable together. (The case where
2870 // a bit is set in PSInputAddr but not PSInputEnable is where the
2871 // frontend set up an input arg for a particular interpolation mode, but
2872 // nothing uses that input arg. Really we should have an earlier pass
2873 // that removes such an arg.)
2874 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2875 if ((PsInputBits & 0x7F) == 0 ||
2876 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2877 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2878 }
2879 } else if (IsKernel) {
2880 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2881 } else {
2882 Splits.append(Ins.begin(), Ins.end());
2883 }
2884
2885 if (IsKernel)
2886 analyzeFormalArgumentsCompute(CCInfo, Ins);
2887
2888 if (IsEntryFunc) {
2889 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2890 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2891 if (IsKernel && Subtarget->hasKernargPreload())
2892 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2893
2894 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2895 } else if (!IsGraphics) {
2896 // For the fixed ABI, pass workitem IDs in the last argument register.
2897 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2898
2899 // FIXME: Sink this into allocateSpecialInputSGPRs
2900 if (!Subtarget->enableFlatScratch())
2901 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2902
2903 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2904 }
2905
2906 if (!IsKernel) {
2907 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2908 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2909 }
2910
2912
2913 // FIXME: This is the minimum kernel argument alignment. We should improve
2914 // this to the maximum alignment of the arguments.
2915 //
2916 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2917 // kern arg offset.
2918 const Align KernelArgBaseAlign = Align(16);
2919
2920 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2921 const ISD::InputArg &Arg = Ins[i];
2922 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2923 InVals.push_back(DAG.getUNDEF(Arg.VT));
2924 continue;
2925 }
2926
2927 CCValAssign &VA = ArgLocs[ArgIdx++];
2928 MVT VT = VA.getLocVT();
2929
2930 if (IsEntryFunc && VA.isMemLoc()) {
2931 VT = Ins[i].VT;
2932 EVT MemVT = VA.getLocVT();
2933
2934 const uint64_t Offset = VA.getLocMemOffset();
2935 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2936
2937 if (Arg.Flags.isByRef()) {
2938 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2939
2940 const GCNTargetMachine &TM =
2941 static_cast<const GCNTargetMachine &>(getTargetMachine());
2942 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2943 Arg.Flags.getPointerAddrSpace())) {
2946 }
2947
2948 InVals.push_back(Ptr);
2949 continue;
2950 }
2951
2952 SDValue NewArg;
2953 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2954 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2955 // In this case the argument is packed into the previous preload SGPR.
2956 int64_t AlignDownOffset = alignDown(Offset, 4);
2957 int64_t OffsetDiff = Offset - AlignDownOffset;
2958 EVT IntVT = MemVT.changeTypeToInteger();
2959
2963 Register Reg =
2964 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2965
2966 assert(Reg);
2967 Register VReg = MRI.getLiveInVirtReg(Reg);
2968 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2969
2970 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2971 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2972
2973 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2974 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2975 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2976 Ins[i].Flags.isSExt(), &Ins[i]);
2977
2978 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2979 } else {
2983 const SmallVectorImpl<MCRegister> &PreloadRegs =
2984 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2985
2986 SDValue Copy;
2987 if (PreloadRegs.size() == 1) {
2988 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2989 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2990 NewArg = DAG.getCopyFromReg(
2991 Chain, DL, VReg,
2993 TRI->getRegSizeInBits(*RC)));
2994
2995 } else {
2996 // If the kernarg alignment does not match the alignment of the SGPR
2997 // tuple RC that can accommodate this argument, it will be built up
2998 // via copies from the individual SGPRs that the argument was
2999 // preloaded to.
3001 for (auto Reg : PreloadRegs) {
3002 Register VReg = MRI.getLiveInVirtReg(Reg);
3003 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3004 Elts.push_back(Copy);
3005 }
3006 NewArg =
3007 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3008 PreloadRegs.size()),
3009 DL, Elts);
3010 }
3011
3012 // If the argument was preloaded to multiple consecutive 32-bit
3013 // registers because of misalignment between addressable SGPR tuples
3014 // and the argument size, we can still assume, because of kernarg
3015 // segment alignment restrictions, that NewArg's size is the same as
3016 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3017 // truncate since we cannot preload to less than a single SGPR and
3018 // MemVT may be smaller.
3019 EVT MemVTInt =
3021 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3022 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3023
3024 NewArg = DAG.getBitcast(MemVT, NewArg);
3025 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3026 Ins[i].Flags.isSExt(), &Ins[i]);
3027 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3028 }
3029 } else {
3030 NewArg =
3031 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3032 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3033 }
3034 Chains.push_back(NewArg.getValue(1));
3035
3036 auto *ParamTy =
3037 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3039 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3040 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3041 // On SI local pointers are just offsets into LDS, so they are always
3042 // less than 16-bits. On CI and newer they could potentially be
3043 // real pointers, so we can't guarantee their size.
3044 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3045 DAG.getValueType(MVT::i16));
3046 }
3047
3048 InVals.push_back(NewArg);
3049 continue;
3050 }
3051 if (!IsEntryFunc && VA.isMemLoc()) {
3052 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3053 InVals.push_back(Val);
3054 if (!Arg.Flags.isByVal())
3055 Chains.push_back(Val.getValue(1));
3056 continue;
3057 }
3058
3059 assert(VA.isRegLoc() && "Parameter must be in a register!");
3060
3061 Register Reg = VA.getLocReg();
3062 const TargetRegisterClass *RC = nullptr;
3063 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3064 RC = &AMDGPU::VGPR_32RegClass;
3065 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3066 RC = &AMDGPU::SGPR_32RegClass;
3067 else
3068 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3069 EVT ValVT = VA.getValVT();
3070
3071 Reg = MF.addLiveIn(Reg, RC);
3072 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3073
3074 if (Arg.Flags.isSRet()) {
3075 // The return object should be reasonably addressable.
3076
3077 // FIXME: This helps when the return is a real sret. If it is an
3078 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3079 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3080 unsigned NumBits
3082 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3083 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3084 }
3085
3086 // If this is an 8 or 16-bit value, it is really passed promoted
3087 // to 32 bits. Insert an assert[sz]ext to capture this, then
3088 // truncate to the right size.
3089 switch (VA.getLocInfo()) {
3090 case CCValAssign::Full:
3091 break;
3092 case CCValAssign::BCvt:
3093 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3094 break;
3095 case CCValAssign::SExt:
3096 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3097 DAG.getValueType(ValVT));
3098 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3099 break;
3100 case CCValAssign::ZExt:
3101 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3102 DAG.getValueType(ValVT));
3103 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3104 break;
3105 case CCValAssign::AExt:
3106 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3107 break;
3108 default:
3109 llvm_unreachable("Unknown loc info!");
3110 }
3111
3112 InVals.push_back(Val);
3113 }
3114
3115 // Start adding system SGPRs.
3116 if (IsEntryFunc)
3117 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3118
3119 // DAG.getPass() returns nullptr when using new pass manager.
3120 // TODO: Use DAG.getMFAM() to access analysis result.
3121 if (DAG.getPass()) {
3122 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3123 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3124 }
3125
3126 unsigned StackArgSize = CCInfo.getStackSize();
3127 Info->setBytesInStackArgArea(StackArgSize);
3128
3129 return Chains.empty() ? Chain :
3130 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3131}
3132
3133// TODO: If return values can't fit in registers, we should return as many as
3134// possible in registers before passing on stack.
3136 CallingConv::ID CallConv,
3137 MachineFunction &MF, bool IsVarArg,
3139 LLVMContext &Context) const {
3140 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3141 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3142 // for shaders. Vector types should be explicitly handled by CC.
3143 if (AMDGPU::isEntryFunctionCC(CallConv))
3144 return true;
3145
3147 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3148 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3149 return false;
3150
3151 // We must use the stack if return would require unavailable registers.
3152 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3153 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3154 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3155 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3156 return false;
3157
3158 return true;
3159}
3160
3161SDValue
3163 bool isVarArg,
3165 const SmallVectorImpl<SDValue> &OutVals,
3166 const SDLoc &DL, SelectionDAG &DAG) const {
3169
3170 if (AMDGPU::isKernel(CallConv)) {
3171 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3172 OutVals, DL, DAG);
3173 }
3174
3175 bool IsShader = AMDGPU::isShader(CallConv);
3176
3177 Info->setIfReturnsVoid(Outs.empty());
3178 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3179
3180 // CCValAssign - represent the assignment of the return value to a location.
3183
3184 // CCState - Info about the registers and stack slots.
3185 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3186 *DAG.getContext());
3187
3188 // Analyze outgoing return values.
3189 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3190
3191 SDValue Glue;
3193 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3194
3195 // Copy the result values into the output registers.
3196 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3197 ++I, ++RealRVLocIdx) {
3198 CCValAssign &VA = RVLocs[I];
3199 assert(VA.isRegLoc() && "Can only return in registers!");
3200 // TODO: Partially return in registers if return values don't fit.
3201 SDValue Arg = OutVals[RealRVLocIdx];
3202
3203 // Copied from other backends.
3204 switch (VA.getLocInfo()) {
3205 case CCValAssign::Full:
3206 break;
3207 case CCValAssign::BCvt:
3208 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3209 break;
3210 case CCValAssign::SExt:
3211 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3212 break;
3213 case CCValAssign::ZExt:
3214 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3215 break;
3216 case CCValAssign::AExt:
3217 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3218 break;
3219 default:
3220 llvm_unreachable("Unknown loc info!");
3221 }
3222
3223 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3224 Glue = Chain.getValue(1);
3225 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3226 }
3227
3228 // FIXME: Does sret work properly?
3229 if (!Info->isEntryFunction()) {
3230 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3231 const MCPhysReg *I =
3232 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3233 if (I) {
3234 for (; *I; ++I) {
3235 if (AMDGPU::SReg_64RegClass.contains(*I))
3236 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3237 else if (AMDGPU::SReg_32RegClass.contains(*I))
3238 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3239 else
3240 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3241 }
3242 }
3243 }
3244
3245 // Update chain and glue.
3246 RetOps[0] = Chain;
3247 if (Glue.getNode())
3248 RetOps.push_back(Glue);
3249
3250 unsigned Opc = AMDGPUISD::ENDPGM;
3251 if (!IsWaveEnd)
3253 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3254}
3255
3257 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3258 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3259 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3260 SDValue ThisVal) const {
3261 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3262
3263 // Assign locations to each value returned by this call.
3265 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3266 *DAG.getContext());
3267 CCInfo.AnalyzeCallResult(Ins, RetCC);
3268
3269 // Copy all of the result registers out of their specified physreg.
3270 for (CCValAssign VA : RVLocs) {
3271 SDValue Val;
3272
3273 if (VA.isRegLoc()) {
3274 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3275 Chain = Val.getValue(1);
3276 InGlue = Val.getValue(2);
3277 } else if (VA.isMemLoc()) {
3278 report_fatal_error("TODO: return values in memory");
3279 } else
3280 llvm_unreachable("unknown argument location type");
3281
3282 switch (VA.getLocInfo()) {
3283 case CCValAssign::Full:
3284 break;
3285 case CCValAssign::BCvt:
3286 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3287 break;
3288 case CCValAssign::ZExt:
3289 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3290 DAG.getValueType(VA.getValVT()));
3291 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3292 break;
3293 case CCValAssign::SExt:
3294 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3295 DAG.getValueType(VA.getValVT()));
3296 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3297 break;
3298 case CCValAssign::AExt:
3299 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3300 break;
3301 default:
3302 llvm_unreachable("Unknown loc info!");
3303 }
3304
3305 InVals.push_back(Val);
3306 }
3307
3308 return Chain;
3309}
3310
3311 // Add code to pass the special inputs required by the used features, separate
3312 // from the explicit user arguments present in the IR.
3314 CallLoweringInfo &CLI,
3315 CCState &CCInfo,
3316 const SIMachineFunctionInfo &Info,
3317 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3318 SmallVectorImpl<SDValue> &MemOpChains,
3319 SDValue Chain) const {
3320 // If we don't have a call site, this was a call inserted by
3321 // legalization. These can never use special inputs.
3322 if (!CLI.CB)
3323 return;
3324
3325 SelectionDAG &DAG = CLI.DAG;
3326 const SDLoc &DL = CLI.DL;
3327 const Function &F = DAG.getMachineFunction().getFunction();
3328
3329 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3330 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3331
3332 const AMDGPUFunctionArgInfo *CalleeArgInfo
3334 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3335 // DAG.getPass() returns nullptr when using new pass manager.
3336 // TODO: Use DAG.getMFAM() to access analysis result.
3337 if (DAG.getPass()) {
3338 auto &ArgUsageInfo =
3340 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3341 }
3342 }
3343
3344 // TODO: Unify with private memory register handling. This is complicated by
3345 // the fact that at least in kernels, the input argument is not necessarily
3346 // in the same location as the input.
3347 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3349 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3350 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3351 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3352 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3353 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3354 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3355 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3356 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3357 };
3358
3359 for (auto Attr : ImplicitAttrs) {
3360 const ArgDescriptor *OutgoingArg;
3361 const TargetRegisterClass *ArgRC;
3362 LLT ArgTy;
3363
3364 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3365
3366 // If the callee does not use the attribute value, skip copying the value.
3367 if (CLI.CB->hasFnAttr(Attr.second))
3368 continue;
3369
3370 std::tie(OutgoingArg, ArgRC, ArgTy) =
3371 CalleeArgInfo->getPreloadedValue(InputID);
3372 if (!OutgoingArg)
3373 continue;
3374
3375 const ArgDescriptor *IncomingArg;
3376 const TargetRegisterClass *IncomingArgRC;
3377 LLT Ty;
3378 std::tie(IncomingArg, IncomingArgRC, Ty) =
3379 CallerArgInfo.getPreloadedValue(InputID);
3380 assert(IncomingArgRC == ArgRC);
3381
3382 // All special arguments are ints for now.
3383 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3384 SDValue InputReg;
3385
3386 if (IncomingArg) {
3387 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3388 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3389 // The implicit arg ptr is special because it doesn't have a corresponding
3390 // input for kernels, and is computed from the kernarg segment pointer.
3391 InputReg = getImplicitArgPtr(DAG, DL);
3392 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3393 std::optional<uint32_t> Id =
3395 if (Id.has_value()) {
3396 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3397 } else {
3398 InputReg = DAG.getUNDEF(ArgVT);
3399 }
3400 } else {
3401 // We may have proven the input wasn't needed, although the ABI still
3402 // requires it. We just need to allocate the register appropriately.
3403 InputReg = DAG.getUNDEF(ArgVT);
3404 }
3405
3406 if (OutgoingArg->isRegister()) {
3407 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3408 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3409 report_fatal_error("failed to allocate implicit input argument");
3410 } else {
3411 unsigned SpecialArgOffset =
3412 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3413 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3414 SpecialArgOffset);
3415 MemOpChains.push_back(ArgStore);
3416 }
3417 }
3418
3419 // Pack the workitem IDs into a single register, or pass them as-is if they
3420 // are already packed.
3421 const ArgDescriptor *OutgoingArg;
3422 const TargetRegisterClass *ArgRC;
3423 LLT Ty;
3424
3425 std::tie(OutgoingArg, ArgRC, Ty) =
3427 if (!OutgoingArg)
3428 std::tie(OutgoingArg, ArgRC, Ty) =
3430 if (!OutgoingArg)
3431 std::tie(OutgoingArg, ArgRC, Ty) =
3433 if (!OutgoingArg)
3434 return;
3435
3436 const ArgDescriptor *IncomingArgX = std::get<0>(
3438 const ArgDescriptor *IncomingArgY = std::get<0>(
3440 const ArgDescriptor *IncomingArgZ = std::get<0>(
3442
3443 SDValue InputReg;
3444 SDLoc SL;
3445
3446 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3447 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3448 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3449
3450 // If incoming ids are not packed we need to pack them.
3451 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3452 NeedWorkItemIDX) {
3453 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3454 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3455 } else {
3456 InputReg = DAG.getConstant(0, DL, MVT::i32);
3457 }
3458 }
3459
3460 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3461 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3462 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3463 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3464 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3465 InputReg = InputReg.getNode() ?
3466 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3467 }
3468
3469 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3470 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3471 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3472 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3473 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3474 InputReg = InputReg.getNode() ?
3475 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3476 }
3477
3478 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3479 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3480 // We're in a situation where the outgoing function requires the workitem
3481 // ID, but the calling function does not have it (e.g. a graphics function
3482 // calling a C calling convention function). This is illegal, but we need
3483 // to produce something.
3484 InputReg = DAG.getUNDEF(MVT::i32);
3485 } else {
3486 // Workitem IDs are already packed; any of the present incoming arguments
3487 // will carry all required fields.
3489 IncomingArgX ? *IncomingArgX :
3490 IncomingArgY ? *IncomingArgY :
3491 *IncomingArgZ, ~0u);
3492 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3493 }
3494 }
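  // Layout sketch (editorial): the packed register built above mirrors the
  // packed-TID layout, e.g. for workitem IDs x, y, z:
  //
  //   InputReg = x | (y << 10) | (z << 20);
  //   // bits [9:0]   -> workitem ID X
  //   // bits [19:10] -> workitem ID Y
  //   // bits [29:20] -> workitem ID Z
  //
  // Components the callee marks with amdgpu-no-workitem-id-* are simply left
  // out of the OR chain.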
3495
3496 if (OutgoingArg->isRegister()) {
3497 if (InputReg)
3498 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3499
3500 CCInfo.AllocateReg(OutgoingArg->getRegister());
3501 } else {
3502 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3503 if (InputReg) {
3504 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3505 SpecialArgOffset);
3506 MemOpChains.push_back(ArgStore);
3507 }
3508 }
3509}
3510
3512 return CC == CallingConv::Fast;
3513}
3514
3515/// Return true if we might ever do TCO for calls with this calling convention.
3517 switch (CC) {
3518 case CallingConv::C:
3520 return true;
3521 default:
3522 return canGuaranteeTCO(CC);
3523 }
3524}
3525
3527 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3529 const SmallVectorImpl<SDValue> &OutVals,
3530 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3531 if (AMDGPU::isChainCC(CalleeCC))
3532 return true;
3533
3534 if (!mayTailCallThisCC(CalleeCC))
3535 return false;
3536
3537 // For a divergent call target, we need to do a waterfall loop over the
3538 // possible callees which precludes us from using a simple jump.
3539 if (Callee->isDivergent())
3540 return false;
3541
3543 const Function &CallerF = MF.getFunction();
3544 CallingConv::ID CallerCC = CallerF.getCallingConv();
3546 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3547
3548 // Kernels aren't callable, and don't have a live in return address so it
3549 // doesn't make sense to do a tail call with entry functions.
3550 if (!CallerPreserved)
3551 return false;
3552
3553 bool CCMatch = CallerCC == CalleeCC;
3554
3556 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3557 return true;
3558 return false;
3559 }
3560
3561 // TODO: Can we handle var args?
3562 if (IsVarArg)
3563 return false;
3564
3565 for (const Argument &Arg : CallerF.args()) {
3566 if (Arg.hasByValAttr())
3567 return false;
3568 }
3569
3570 LLVMContext &Ctx = *DAG.getContext();
3571
3572 // Check that the call results are passed in the same way.
3573 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3574 CCAssignFnForCall(CalleeCC, IsVarArg),
3575 CCAssignFnForCall(CallerCC, IsVarArg)))
3576 return false;
3577
3578 // The callee has to preserve all registers the caller needs to preserve.
3579 if (!CCMatch) {
3580 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3581 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3582 return false;
3583 }
3584
3585 // Nothing more to check if the callee is taking no arguments.
3586 if (Outs.empty())
3587 return true;
3588
3590 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3591
3592 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3593
3594 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3595 // If the stack arguments for this call do not fit into our own save area then
3596 // the call cannot be made tail.
3597 // TODO: Is this really necessary?
3598 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3599 return false;
3600
3601 const MachineRegisterInfo &MRI = MF.getRegInfo();
3602 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3603}
3604
3606 if (!CI->isTailCall())
3607 return false;
3608
3609 const Function *ParentFn = CI->getParent()->getParent();
3611 return false;
3612 return true;
3613}
3614
3615// The wave scratch offset register is used as the global base pointer.
3617 SmallVectorImpl<SDValue> &InVals) const {
3618 CallingConv::ID CallConv = CLI.CallConv;
3619 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3620
3621 SelectionDAG &DAG = CLI.DAG;
3622
3623 TargetLowering::ArgListEntry RequestedExec;
3624 if (IsChainCallConv) {
3625 // The last argument should be the value that we need to put in EXEC.
3626 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3627 // don't treat it like the rest of the arguments.
3628 RequestedExec = CLI.Args.back();
3629 assert(RequestedExec.Node && "No node for EXEC");
3630
3631 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3632 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3633
3634 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3635 CLI.Outs.pop_back();
3636 CLI.OutVals.pop_back();
3637
3638 if (RequestedExec.Ty->isIntegerTy(64)) {
3639 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3640 CLI.Outs.pop_back();
3641 CLI.OutVals.pop_back();
3642 }
3643
3644 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3645 "Haven't popped all the pieces of the EXEC mask");
3646 }
3647
3648 const SDLoc &DL = CLI.DL;
3650 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3652 SDValue Chain = CLI.Chain;
3653 SDValue Callee = CLI.Callee;
3654 bool &IsTailCall = CLI.IsTailCall;
3655 bool IsVarArg = CLI.IsVarArg;
3656 bool IsSibCall = false;
3658
3659 if (Callee.isUndef() || isNullConstant(Callee)) {
3660 if (!CLI.IsTailCall) {
3661 for (ISD::InputArg &Arg : CLI.Ins)
3662 InVals.push_back(DAG.getUNDEF(Arg.VT));
3663 }
3664
3665 return Chain;
3666 }
3667
3668 if (IsVarArg) {
3669 return lowerUnhandledCall(CLI, InVals,
3670 "unsupported call to variadic function ");
3671 }
3672
3673 if (!CLI.CB)
3674 report_fatal_error("unsupported libcall legalization");
3675
3676 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3677 return lowerUnhandledCall(CLI, InVals,
3678 "unsupported required tail call to function ");
3679 }
3680
3681 if (IsTailCall) {
3682 IsTailCall = isEligibleForTailCallOptimization(
3683 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3684 if (!IsTailCall &&
3685 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3686 report_fatal_error("failed to perform tail call elimination on a call "
3687 "site marked musttail or on llvm.amdgcn.cs.chain");
3688 }
3689
3690 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3691
3692 // A sibling call is one where we're under the usual C ABI and not planning
3693 // to change that but can still do a tail call:
3694 if (!TailCallOpt && IsTailCall)
3695 IsSibCall = true;
3696
3697 if (IsTailCall)
3698 ++NumTailCalls;
3699 }
3700
3701 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3702 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3703 SmallVector<SDValue, 8> MemOpChains;
3704
3705 // Analyze operands of the call, assigning locations to each operand.
3706 SmallVector<CCValAssign, 16> ArgLocs;
3707 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3708 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3709
3710 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3711 // With a fixed ABI, allocate fixed registers before user arguments.
3712 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3713 }
3714
3715 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3716
3717 // Get a count of how many bytes are to be pushed on the stack.
3718 unsigned NumBytes = CCInfo.getStackSize();
3719
3720 if (IsSibCall) {
3721 // Since we're not changing the ABI to make this a tail call, the memory
3722 // operands are already available in the caller's incoming argument space.
3723 NumBytes = 0;
3724 }
3725
3726 // FPDiff is the byte offset of the call's argument area from the callee's.
3727 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3728 // by this amount for a tail call. In a sibling call it must be 0 because the
3729 // caller will deallocate the entire stack and the callee still expects its
3730 // arguments to begin at SP+0. Completely unused for non-tail calls.
3731 int32_t FPDiff = 0;
3732 MachineFrameInfo &MFI = MF.getFrameInfo();
3733
3734 // Adjust the stack pointer for the new arguments...
3735 // These operations are automatically eliminated by the prolog/epilog pass
3736 if (!IsSibCall)
3737 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3738
3739 if (!IsSibCall || IsChainCallConv) {
3740 if (!Subtarget->enableFlatScratch()) {
3741 SmallVector<SDValue, 4> CopyFromChains;
3742
3743 // In the HSA case, this should be an identity copy.
3744 SDValue ScratchRSrcReg
3745 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3746 RegsToPass.emplace_back(IsChainCallConv
3747 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3748 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3749 ScratchRSrcReg);
3750 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3751 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3752 }
3753 }
3754
3755 MVT PtrVT = MVT::i32;
3756
3757 // Walk the register/memloc assignments, inserting copies/loads.
3758 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3759 CCValAssign &VA = ArgLocs[i];
3760 SDValue Arg = OutVals[i];
3761
3762 // Promote the value if needed.
3763 switch (VA.getLocInfo()) {
3764 case CCValAssign::Full:
3765 break;
3766 case CCValAssign::BCvt:
3767 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3768 break;
3769 case CCValAssign::ZExt:
3770 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3771 break;
3772 case CCValAssign::SExt:
3773 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3774 break;
3775 case CCValAssign::AExt:
3776 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3777 break;
3778 case CCValAssign::FPExt:
3779 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3780 break;
3781 default:
3782 llvm_unreachable("Unknown loc info!");
3783 }
3784
3785 if (VA.isRegLoc()) {
3786 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3787 } else {
3788 assert(VA.isMemLoc());
3789
3790 SDValue DstAddr;
3791 MachinePointerInfo DstInfo;
3792
3793 unsigned LocMemOffset = VA.getLocMemOffset();
3794 int32_t Offset = LocMemOffset;
3795
3796 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3797 MaybeAlign Alignment;
3798
3799 if (IsTailCall) {
3800 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3801 unsigned OpSize = Flags.isByVal() ?
3802 Flags.getByValSize() : VA.getValVT().getStoreSize();
3803
3804 // FIXME: We can have better than the minimum required byval alignment.
3805 Alignment =
3806 Flags.isByVal()
3807 ? Flags.getNonZeroByValAlign()
3808 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3809
3810 Offset = Offset + FPDiff;
3811 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3812
3813 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3814 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3815
3816 // Make sure any stack arguments overlapping with where we're storing
3817 // are loaded before this eventual operation. Otherwise they'll be
3818 // clobbered.
3819
3820 // FIXME: Why is this really necessary? This seems to just result in a
3821 // lot of code that copies the stack arguments and writes them back to the
3822 // same locations, which are supposed to be immutable?
3823 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3824 } else {
3825 // Stores to the argument stack area are relative to the stack pointer.
3826 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3827 MVT::i32);
3828 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3829 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3830 Alignment =
3831 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3832 }
3833
3834 if (Outs[i].Flags.isByVal()) {
3835 SDValue SizeNode =
3836 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3837 SDValue Cpy =
3838 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3839 Outs[i].Flags.getNonZeroByValAlign(),
3840 /*isVol = */ false, /*AlwaysInline = */ true,
3841 /*CI=*/nullptr, std::nullopt, DstInfo,
3842 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3843
3844 MemOpChains.push_back(Cpy);
3845 } else {
3846 SDValue Store =
3847 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3848 MemOpChains.push_back(Store);
3849 }
3850 }
3851 }
3852
3853 if (!MemOpChains.empty())
3854 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3855
3856 // Build a sequence of copy-to-reg nodes chained together with token chain
3857 // and flag operands which copy the outgoing args into the appropriate regs.
3858 SDValue InGlue;
3859 for (auto &RegToPass : RegsToPass) {
3860 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3861 RegToPass.second, InGlue);
3862 InGlue = Chain.getValue(1);
3863 }
3864
3865
3866 // We don't usually want to end the call-sequence here because we would tidy
3867 // the frame up *after* the call; however, in the ABI-changing tail-call case
3868 // we've carefully laid out the parameters so that when SP is reset they'll be
3869 // in the correct location.
3870 if (IsTailCall && !IsSibCall) {
3871 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3872 InGlue = Chain.getValue(1);
3873 }
3874
3875 std::vector<SDValue> Ops;
3876 Ops.push_back(Chain);
3877 Ops.push_back(Callee);
3878 // Add a redundant copy of the callee global which will not be legalized, as
3879 // we need direct access to the callee later.
3880 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3881 const GlobalValue *GV = GSD->getGlobal();
3882 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3883 } else {
3884 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3885 }
3886
3887 if (IsTailCall) {
3888 // Each tail call may have to adjust the stack by a different amount, so
3889 // this information must travel along with the operation for eventual
3890 // consumption by emitEpilogue.
3891 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3892 }
3893
3894 if (IsChainCallConv)
3895 Ops.push_back(RequestedExec.Node);
3896
3897 // Add argument registers to the end of the list so that they are known live
3898 // into the call.
3899 for (auto &RegToPass : RegsToPass) {
3900 Ops.push_back(DAG.getRegister(RegToPass.first,
3901 RegToPass.second.getValueType()));
3902 }
3903
3904 // Add a register mask operand representing the call-preserved registers.
3905 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3906 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3907 assert(Mask && "Missing call preserved mask for calling convention");
3908 Ops.push_back(DAG.getRegisterMask(Mask));
3909
3910 if (SDValue Token = CLI.ConvergenceControlToken) {
3911 SmallVector<SDValue, 2> GlueOps;
3912 GlueOps.push_back(Token);
3913 if (InGlue)
3914 GlueOps.push_back(InGlue);
3915
3916 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3917 MVT::Glue, GlueOps),
3918 0);
3919 }
3920
3921 if (InGlue)
3922 Ops.push_back(InGlue);
3923
3924 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3925
3926 // If we're doing a tail call, use a TC_RETURN here rather than an
3927 // actual call instruction.
3928 if (IsTailCall) {
3929 MFI.setHasTailCall();
3930 unsigned OPC = AMDGPUISD::TC_RETURN;
3931 switch (CallConv) {
3932 case CallingConv::AMDGPU_Gfx:
3933 OPC = AMDGPUISD::TC_RETURN_GFX;
3934 break;
3935 case CallingConv::AMDGPU_CS_Chain:
3936 case CallingConv::AMDGPU_CS_ChainPreserve:
3937 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3938 break;
3939 }
3940
3941 return DAG.getNode(OPC, DL, NodeTys, Ops);
3942 }
3943
3944 // Returns a chain and a flag for retval copy to use.
3945 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3946 Chain = Call.getValue(0);
3947 InGlue = Call.getValue(1);
3948
3949 uint64_t CalleePopBytes = NumBytes;
3950 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3951 if (!Ins.empty())
3952 InGlue = Chain.getValue(1);
3953
3954 // Handle result values, copying them out of physregs into vregs that we
3955 // return.
3956 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3957 InVals, /*IsThisReturn=*/false, SDValue());
3958}
3959
3960// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3961// except for applying the wave size scale to the increment amount.
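// Editorial worked example: on a wave64 subtarget getWavefrontSizeLog2() is 6,
// so a 16-byte per-lane allocation is scaled to 16 << 6 == 1024 bytes of
// scratch, since the backing scratch memory interleaves every lane of the wave.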
3962SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3963 SDValue Op, SelectionDAG &DAG) const {
3964 const MachineFunction &MF = DAG.getMachineFunction();
3965 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3966
3967 SDLoc dl(Op);
3968 EVT VT = Op.getValueType();
3969 SDValue Tmp1 = Op;
3970 SDValue Tmp2 = Op.getValue(1);
3971 SDValue Tmp3 = Op.getOperand(2);
3972 SDValue Chain = Tmp1.getOperand(0);
3973
3974 Register SPReg = Info->getStackPtrOffsetReg();
3975
3976 // Chain the dynamic stack allocation so that it doesn't modify the stack
3977 // pointer when other instructions are using the stack.
3978 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3979
3980 SDValue Size = Tmp2.getOperand(1);
3981 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3982 Chain = SP.getValue(1);
3983 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3984 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3985 unsigned Opc =
3986 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3987 ISD::ADD : ISD::SUB;
3988
3989 SDValue ScaledSize = DAG.getNode(
3990 ISD::SHL, dl, VT, Size,
3991 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3992
3993 Align StackAlign = TFL->getStackAlign();
3994 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3995 if (Alignment && *Alignment > StackAlign) {
3996 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3997 DAG.getConstant(-(uint64_t)Alignment->value()
3998 << Subtarget->getWavefrontSizeLog2(),
3999 dl, VT));
4000 }
4001
4002 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4003 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4004
4005 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4006}
4007
4008SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4009 SelectionDAG &DAG) const {
4010 // We only handle constant sizes here to allow non-entry block, static sized
4011 // allocas. A truly dynamic value is more difficult to support because we
4012 // don't know if the size value is uniform or not. If the size isn't uniform,
4013 // we would need to do a wave reduction to get the maximum size to know how
4014 // much to increment the uniform stack pointer.
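 // Editorial illustration: a non-entry-block alloca whose element count is a
 // divergent VGPR value would first need a wave-wide maximum reduction of that
 // count before the uniform stack pointer could be bumped; that path is not
 // implemented, so only constant-sized allocas are handled below.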
4015 SDValue Size = Op.getOperand(1);
4016 if (isa<ConstantSDNode>(Size))
4017 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4018
4020}
4021
4022SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4023 if (Op.getValueType() != MVT::i32)
4024 return Op; // Defer to cannot select error.
4025
4026 Register SP = getStackPointerRegisterToSaveRestore();
4027 SDLoc SL(Op);
4028
4029 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4030
4031 // Convert from wave uniform to swizzled vector address. This should protect
4032 // from any edge cases where the stacksave result isn't directly used with
4033 // stackrestore.
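 // Editorial note (assumes WAVE_ADDRESS lowers to a right shift by the
 // wave-size log2): on wave64, a wave-uniform SP byte offset of 0x400
 // corresponds to a per-lane swizzled address of 0x400 >> 6 == 0x10.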
4034 SDValue VectorAddress =
4035 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4036 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4037}
4038
4039SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4040 SelectionDAG &DAG) const {
4041 SDLoc SL(Op);
4042 assert(Op.getValueType() == MVT::i32);
4043
4044 uint32_t BothRoundHwReg =
4046 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4047
4048 SDValue IntrinID =
4049 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4050 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4051 Op.getOperand(0), IntrinID, GetRoundBothImm);
4052
4053 // There are two rounding modes, one for f32 and one for f64/f16. We only
4054 // report in the standard value range if both are the same.
4055 //
4056 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4057 // ties away from zero is not supported, and the other values are rotated by
4058 // 1.
4059 //
4060 // If the two rounding modes are not the same, report a target defined value.
4061
4062 // Mode register rounding mode fields:
4063 //
4064 // [1:0] Single-precision round mode.
4065 // [3:2] Double/Half-precision round mode.
4066 //
4067 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4068 //
4069 // Mode            Hardware   Spec
4070 // Toward-0           3         0
4071 // Nearest Even       0         1
4072 // +Inf               1         2
4073 // -Inf               2         3
4074 // NearestAway0      N/A        4
4075 //
4076 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4077 // table we can index by the raw hardware mode.
4078 //
4079 // (trunc (FltRoundConversionTable >> (MODE.fp_round * 4))) & 0xf
4080
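 // Worked example (editorial illustration): MODE.fp_round == 0b0000 means both
 // fields are round-to-nearest-even, so nibble 0 of the table is selected and,
 // per the Spec column above, yields FLT_ROUNDS == 1 ("to nearest"). A mixed
 // setting such as f32 toward-zero with f64/f16 nearest-even selects a nibble
 // >= 4, which the code below offsets by 4 into the target-defined range.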
4081 SDValue BitTable =
4082 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4083
4084 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4085 SDValue RoundModeTimesNumBits =
4086 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4087
4088 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4089 // knew only one mode was demanded.
4090 SDValue TableValue =
4091 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4092 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4093
4094 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4095 SDValue TableEntry =
4096 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4097
4098 // There's a gap between the 4-bit encoded table values and the actual enum
4099 // values, so offset the result if it's an extended value.
4100 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4101 SDValue IsStandardValue =
4102 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4103 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4104 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4105 TableEntry, EnumOffset);
4106
4107 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4108}
4109
4110SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4111 SelectionDAG &DAG) const {
4112 SDLoc SL(Op);
4113
4114 SDValue NewMode = Op.getOperand(1);
4115 assert(NewMode.getValueType() == MVT::i32);
4116
4117 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4118 // hardware MODE.fp_round values.
4119 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4120 uint32_t ClampedVal = std::min(
4121 static_cast<uint32_t>(ConstMode->getZExtValue()),
4123 NewMode = DAG.getConstant(
4124 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4125 } else {
4126 // If we know the input can only be one of the supported standard modes in
4127 // the range 0-3, we can use a simplified mapping to hardware values.
4128 KnownBits KB = DAG.computeKnownBits(NewMode);
4129 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
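 // Editorial note: 30 or more known leading zero bits in a 32-bit value means
 // only the low two bits can be nonzero, i.e. NewMode is provably in [0, 3],
 // the standard FLT_ROUNDS range.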
4130 // The supported standard values are 0-3. The extended values start at 8. We
4131 // need to offset by 4 if the value is in the extended range.
4132
4133 if (UseReducedTable) {
4134 // Only the low 16 bits of the table (the four standard modes) are needed.
4135 SDValue BitTable = DAG.getConstant(
4136 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4137
4138 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4139 SDValue RoundModeTimesNumBits =
4140 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4141
4142 NewMode =
4143 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4144
4145 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4146 // the table extracted bits into inline immediates.
4147 } else {
4148 // table_index = umin(value, value - 4)
4149 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
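 // Worked example (editorial illustration): for a standard value such as 2
 // (+Inf), 2 - 4 wraps to 0xfffffffe, so umin(2, 0xfffffffe) == 2 indexes the
 // table directly; for an extended value such as 9, umin(9, 5) == 5, folding
 // the extended range (which starts at 8) onto table indices starting at 4.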
4150 SDValue BitTable =
4151 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4152
4153 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4154 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4155 SDValue IndexVal =
4156 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4157
4158 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4159 SDValue RoundModeTimesNumBits =
4160 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4161
4162 SDValue TableValue =
4163 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4164 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4165
4166 // No need to mask out the high bits since the setreg will ignore them
4167 // anyway.
4168 NewMode = TruncTable;
4169 }
4170
4171 // Insert a readfirstlane in case the value is a VGPR. We could do this
4172 // earlier and keep more operations scalar, but that interferes with
4173 // combining the source.
4174 SDValue ReadFirstLaneID =
4175 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4176 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4177 ReadFirstLaneID, NewMode);
4178 }
4179
4180 // N.B. The setreg will be later folded into s_round_mode on supported
4181 // targets.
4182 SDValue IntrinID =
4183 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4184 uint32_t BothRoundHwReg =
4186 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4187
4188 SDValue SetReg =
4189 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4190 IntrinID, RoundBothImm, NewMode);
4191
4192 return SetReg;
4193}
4194
4195SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4196 if (Op->isDivergent())
4197 return SDValue();
4198
4199 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4200 case AMDGPUAS::FLAT_ADDRESS:
4201 case AMDGPUAS::GLOBAL_ADDRESS:
4202 case AMDGPUAS::CONSTANT_ADDRESS:
4203 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4204 break;
4205 default:
4206 return SDValue();
4207 }
4208
4209 return Op;
4210}
4211
4212// Work around DAG legality rules that are only based on the result type.
4213SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4214 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4215 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4216 EVT SrcVT = Src.getValueType();
4217
4218 if (SrcVT.getScalarType() != MVT::bf16)
4219 return Op;
4220
4221 SDLoc SL(Op);
4222 SDValue BitCast =
4223 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4224
4225 EVT DstVT = Op.getValueType();
4226 if (IsStrict)
4227 llvm_unreachable("Need STRICT_BF16_TO_FP");
4228
4229 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4230}
4231
4232SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4233 SDLoc SL(Op);
4234 if (Op.getValueType() != MVT::i64)
4235 return Op;
4236
4237 uint32_t ModeHwReg =
4239 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4240 uint32_t TrapHwReg =
4242 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4243
4244 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4245 SDValue IntrinID =
4246 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4247 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4248 Op.getOperand(0), IntrinID, ModeHwRegImm);
4249 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4250 Op.getOperand(0), IntrinID, TrapHwRegImm);
4251 SDValue TokenReg =
4252 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4253 GetTrapReg.getValue(1));
4254
4255 SDValue CvtPtr =
4256 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4257 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4258
4259 return DAG.getMergeValues({Result, TokenReg}, SL);
4260}
4261
4262SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4263 SDLoc SL(Op);
4264 if (Op.getOperand(1).getValueType() != MVT::i64)
4265 return Op;
4266
4267 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4268 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4269 DAG.getConstant(0, SL, MVT::i32));
4270 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4271 DAG.getConstant(1, SL, MVT::i32));
4272
4273 SDValue ReadFirstLaneID =
4274 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4275 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4276 ReadFirstLaneID, NewModeReg);
4277 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4278 ReadFirstLaneID, NewTrapReg);
4279
4280 unsigned ModeHwReg =
4282 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4283 unsigned TrapHwReg =
4285 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4286
4287 SDValue IntrinID =
4288 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4289 SDValue SetModeReg =
4290 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4291 IntrinID, ModeHwRegImm, NewModeReg);
4292 SDValue SetTrapReg =
4293 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4294 IntrinID, TrapHwRegImm, NewTrapReg);
4295 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4296}
4297
4298Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4299 const MachineFunction &MF) const {
4300 Register Reg = StringSwitch<Register>(RegName)
4301 .Case("m0", AMDGPU::M0)
4302 .Case("exec", AMDGPU::EXEC)
4303 .Case("exec_lo", AMDGPU::EXEC_LO)
4304 .Case("exec_hi", AMDGPU::EXEC_HI)
4305 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4306 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4307 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4308 .Default(Register());
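 // Editorial note: this hook backs the named-register intrinsics
 // (llvm.read_register / llvm.write_register), so e.g. reading the named
 // register "exec_lo" from IR resolves through this table.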
4309
4310 if (Reg == AMDGPU::NoRegister) {
4311 report_fatal_error(Twine("invalid register name \""
4312 + StringRef(RegName) + "\"."));
4313
4314 }
4315
4316 if (!Subtarget->hasFlatScrRegister() &&
4317 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4318 report_fatal_error(Twine("invalid register \""
4319 + StringRef(RegName) + "\" for subtarget."));
4320 }
4321
4322 switch (Reg) {
4323 case AMDGPU::M0:
4324 case AMDGPU::EXEC_LO:
4325 case AMDGPU::EXEC_HI:
4326 case AMDGPU::FLAT_SCR_LO:
4327 case AMDGPU::FLAT_SCR_HI:
4328 if (VT.getSizeInBits() == 32)
4329