1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
53static cl::opt<bool> DisableLoopAlignment(
54    "amdgpu-disable-loop-alignment",
55    cl::desc("Do not align and prefetch loops"),
56    cl::init(false));
57
58static cl::opt<bool> UseDivergentRegisterIndexing(
59    "amdgpu-use-divergent-register-indexing",
60    cl::Hidden,
61    cl::desc("Use indirect register addressing for divergent indexes"),
62    cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85                                   const GCNSubtarget &STI)
86    : AMDGPUTargetLowering(TM, STI),
87      Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162    // Unless there are also VOP3P operations, not all operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
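  // Illustrative note (editor's addition, not in the original source): with
  // ZeroOrOneBooleanContent a compare result is already 0 or 1, so a
  // zero-extended compare can reuse that value directly, while a
  // sign-extension still lowers to a select of 0 / -1.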
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218      // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
344
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
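  // Illustrative note (editor's addition): a v2i64 value lives in the same
  // 128-bit register as a v4i32, so e.g. a v2i64 BUILD_VECTOR is handled as a
  // v4i32 BUILD_VECTOR of the 32-bit halves; the loops below apply the same
  // promotion to the wider 64-bit element vectors.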
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466  // This is s_memtime on SI and s_memrealtime on VI.
468
469 if (Subtarget->hasSMemRealTime() ||
473
474 if (Subtarget->has16BitInsts()) {
477 } else {
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
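  // Illustrative example (editor's addition): an i64 bitfield extract of bits
  // [39:24] spans the 32-bit midpoint and cannot be matched to a single 32-bit
  // s_bfe/v_bfe, whereas an extract confined to either 32-bit half could be.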
508 if (Subtarget->hasBFE())
510
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
530 Legal);
531 else
533 MVT::f64, Custom);
534
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
863 Custom);
864
866 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
867 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
868 MVT::i16, MVT::i8, MVT::i128},
869 Custom);
870
872 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
873 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
874 MVT::i8, MVT::i128},
875 Custom);
876
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925  // All memory operations. Some folding on the pointer operand is done to help
926  // match the constant offsets in the addressing modes.
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
955
957 return Subtarget;
958}
959
961 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
962 return RCRegs;
963}
964
965//===----------------------------------------------------------------------===//
966// TargetLowering queries
967//===----------------------------------------------------------------------===//
968
969// v_mad_mix* support a conversion from f16 to f32.
970//
971// There is only one special case, when denormals are enabled, where this is
972// OK to use, which we don't currently handle.
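// Illustrative example (editor's addition, assuming a subtarget with
// mixed-precision FMA):
//   (f32 (fma (fpext f16:$a), (fpext f16:$b), f32:$c))
// can select to v_fma_mix_f32 when the checks below pass, so the fpext is
// treated as free to fold.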
973bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
974 EVT DestVT, EVT SrcVT) const {
975 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
976 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
977 DestVT.getScalarType() == MVT::f32 &&
978 SrcVT.getScalarType() == MVT::f16 &&
979 // TODO: This probably only requires no input flushing?
981}
982
984 LLT DestTy, LLT SrcTy) const {
985 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
986 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
987 DestTy.getScalarSizeInBits() == 32 &&
988 SrcTy.getScalarSizeInBits() == 16 &&
989 // TODO: This probably only requires no input flushing?
991}
992
994 // SI has some legal vector types, but no legal vector operations. Say no
995 // shuffles are legal in order to prefer scalarizing some vector operations.
996 return false;
997}
998
1001 EVT VT) const {
1004
1005 if (VT.isVector()) {
1006 EVT ScalarVT = VT.getScalarType();
1007 unsigned Size = ScalarVT.getSizeInBits();
1008 if (Size == 16) {
1009 if (Subtarget->has16BitInsts()) {
1010 if (VT.isInteger())
1011 return MVT::v2i16;
1012 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1013 }
1014 return VT.isInteger() ? MVT::i32 : MVT::f32;
1015 }
1016
1017 if (Size < 16)
1018 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1019 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1020 }
1021
1022 if (VT.getSizeInBits() > 32)
1023 return MVT::i32;
1024
1026}
1027
1030 EVT VT) const {
1033
1034 if (VT.isVector()) {
1035 unsigned NumElts = VT.getVectorNumElements();
1036 EVT ScalarVT = VT.getScalarType();
1037 unsigned Size = ScalarVT.getSizeInBits();
1038
1039 // FIXME: Should probably promote 8-bit vectors to i16.
1040 if (Size == 16 && Subtarget->has16BitInsts())
1041 return (NumElts + 1) / 2;
1042
1043 if (Size <= 32)
1044 return NumElts;
1045
1046 if (Size > 32)
1047 return NumElts * ((Size + 31) / 32);
1048 } else if (VT.getSizeInBits() > 32)
1049 return (VT.getSizeInBits() + 31) / 32;
1050
1052}
1053
1055 LLVMContext &Context, CallingConv::ID CC,
1056 EVT VT, EVT &IntermediateVT,
1057 unsigned &NumIntermediates, MVT &RegisterVT) const {
1058 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1059 unsigned NumElts = VT.getVectorNumElements();
1060 EVT ScalarVT = VT.getScalarType();
1061 unsigned Size = ScalarVT.getSizeInBits();
1062 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1063    // support, but unless we can properly handle 3-vectors, it will still be
1064 // inconsistent.
1065 if (Size == 16 && Subtarget->has16BitInsts()) {
1066 if (ScalarVT == MVT::bf16) {
1067 RegisterVT = MVT::i32;
1068 IntermediateVT = MVT::v2bf16;
1069 } else {
1070 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1071 IntermediateVT = RegisterVT;
1072 }
1073 NumIntermediates = (NumElts + 1) / 2;
1074 return NumIntermediates;
1075 }
1076
1077 if (Size == 32) {
1078 RegisterVT = ScalarVT.getSimpleVT();
1079 IntermediateVT = RegisterVT;
1080 NumIntermediates = NumElts;
1081 return NumIntermediates;
1082 }
1083
1084 if (Size < 16 && Subtarget->has16BitInsts()) {
1085 // FIXME: Should probably form v2i16 pieces
1086 RegisterVT = MVT::i16;
1087 IntermediateVT = ScalarVT;
1088 NumIntermediates = NumElts;
1089 return NumIntermediates;
1090 }
1091
1092
1093 if (Size != 16 && Size <= 32) {
1094 RegisterVT = MVT::i32;
1095 IntermediateVT = ScalarVT;
1096 NumIntermediates = NumElts;
1097 return NumIntermediates;
1098 }
1099
1100 if (Size > 32) {
1101 RegisterVT = MVT::i32;
1102 IntermediateVT = RegisterVT;
1103 NumIntermediates = NumElts * ((Size + 31) / 32);
1104 return NumIntermediates;
1105 }
1106 }
1107
1109 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1110}
1111
1112static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1113 assert(MaxNumLanes != 0);
1114
1115 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1116 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1117 return EVT::getVectorVT(Ty->getContext(),
1118 EVT::getEVT(VT->getElementType()),
1119 NumElts);
1120 }
1121
1122 return EVT::getEVT(Ty);
1123}
1124
1125// Peek through TFE struct returns to only use the data size.
1126static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1127 auto *ST = dyn_cast<StructType>(Ty);
1128 if (!ST)
1129 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1130
1131 // TFE intrinsics return an aggregate type.
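  // Illustrative example (editor's addition): an image load with TFE enabled
  // returns a struct such as { <4 x float>, i32 }, where the trailing i32 is
  // the status dword; only the data member determines the memory VT.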
1132 assert(ST->getNumContainedTypes() == 2 &&
1133 ST->getContainedType(1)->isIntegerTy(32));
1134 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1135}
1136
1137/// Map address space 7 to MVT::v5i32 because that's its in-memory
1138/// representation. This return value is vector-typed because there is no
1139/// MVT::i160 and it is not clear if one can be added. While this could
1140/// cause issues during codegen, these address space 7 pointers will be
1141/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1142/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1143/// modeling, to work.
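/// Illustrative note (editor's addition): under this mapping, a pre-codegen
/// cost query on a load of a ptr addrspace(7) value sees MVT::v5i32 (160
/// bits), matching the in-memory size of the buffer fat pointer.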
1145 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1146 return MVT::v5i32;
1148 DL.getPointerSizeInBits(AS) == 192)
1149 return MVT::v6i32;
1151}
1152/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1153/// v8i32 when padding is added.
1154/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1155/// also v8i32 with padding.
1157 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1158 DL.getPointerSizeInBits(AS) == 160) ||
1160 DL.getPointerSizeInBits(AS) == 192))
1161 return MVT::v8i32;
1163}
1164
1166 const CallInst &CI,
1167 MachineFunction &MF,
1168 unsigned IntrID) const {
1170 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1172
1173 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1176 (Intrinsic::ID)IntrID);
1177 MemoryEffects ME = Attr.getMemoryEffects();
1178 if (ME.doesNotAccessMemory())
1179 return false;
1180
1181 // TODO: Should images get their own address space?
1182 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1183
1184 if (RsrcIntr->IsImage)
1185 Info.align.reset();
1186
1187 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1188 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1189 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1190 // We conservatively set the memory operand of a buffer intrinsic to the
1191 // base resource pointer, so that we can access alias information about
1192 // those pointers. Cases like "this points at the same value
1193 // but with a different offset" are handled in
1194 // areMemAccessesTriviallyDisjoint.
1195 Info.ptrVal = RsrcArg;
1196 }
1197
1198 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1199 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1202 if (ME.onlyReadsMemory()) {
1203 unsigned MaxNumLanes = 4;
1204
1205 if (RsrcIntr->IsImage) {
1208 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1210
1211 if (!BaseOpcode->Gather4) {
1212 // If this isn't a gather, we may have excess loaded elements in the
1213 // IR type. Check the dmask for the real number of elements loaded.
1214 unsigned DMask
1215 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1216 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1217 }
1218 }
1219
1220 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1221
1222 // FIXME: What does alignment mean for an image?
1225 } else if (ME.onlyWritesMemory()) {
1227
1228 Type *DataTy = CI.getArgOperand(0)->getType();
1229 if (RsrcIntr->IsImage) {
1230 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1231 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1232 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1233 } else
1234 Info.memVT = EVT::getEVT(DataTy);
1235
1237 } else {
1238 // Atomic
1239 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1244
1245 switch (IntrID) {
1246 default:
1247 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1248 // XXX - Should this be volatile without known ordering?
1250 break;
1251 case Intrinsic::amdgcn_raw_buffer_load_lds:
1252 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1253 case Intrinsic::amdgcn_struct_buffer_load_lds:
1254 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1255 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1256 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1257 Info.ptrVal = CI.getArgOperand(1);
1258 return true;
1259 }
1260 }
1261 }
1262 return true;
1263 }
1264
1265 switch (IntrID) {
1266 case Intrinsic::amdgcn_ds_ordered_add:
1267 case Intrinsic::amdgcn_ds_ordered_swap:
1268 case Intrinsic::amdgcn_ds_fadd:
1269 case Intrinsic::amdgcn_ds_fmin:
1270 case Intrinsic::amdgcn_ds_fmax: {
1272 Info.memVT = MVT::getVT(CI.getType());
1273 Info.ptrVal = CI.getOperand(0);
1274 Info.align.reset();
1276
1277 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1278 if (!Vol->isZero())
1280
1281 return true;
1282 }
1283 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1285 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1286 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1287 Info.align.reset();
1289
1290 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1291 if (!Vol || !Vol->isZero())
1293
1294 return true;
1295 }
1296 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1297 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1299 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1300 Info.ptrVal = nullptr;
1301 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1303 return true;
1304 }
1305 case Intrinsic::amdgcn_ds_append:
1306 case Intrinsic::amdgcn_ds_consume: {
1308 Info.memVT = MVT::getVT(CI.getType());
1309 Info.ptrVal = CI.getOperand(0);
1310 Info.align.reset();
1312
1313 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1314 if (!Vol->isZero())
1316
1317 return true;
1318 }
1319 case Intrinsic::amdgcn_global_atomic_csub: {
1321 Info.memVT = MVT::getVT(CI.getType());
1322 Info.ptrVal = CI.getOperand(0);
1323 Info.align.reset();
1327 return true;
1328 }
1329 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1331 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1332
1333 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1334 Info.align.reset();
1337 return true;
1338 }
1339 case Intrinsic::amdgcn_global_atomic_fadd:
1340 case Intrinsic::amdgcn_global_atomic_fmin:
1341 case Intrinsic::amdgcn_global_atomic_fmax:
1342 case Intrinsic::amdgcn_global_atomic_fmin_num:
1343 case Intrinsic::amdgcn_global_atomic_fmax_num:
1344 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1345 case Intrinsic::amdgcn_flat_atomic_fadd:
1346 case Intrinsic::amdgcn_flat_atomic_fmin:
1347 case Intrinsic::amdgcn_flat_atomic_fmax:
1348 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1349 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1350 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1351 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1352 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1354 Info.memVT = MVT::getVT(CI.getType());
1355 Info.ptrVal = CI.getOperand(0);
1356 Info.align.reset();
1361 return true;
1362 }
1363 case Intrinsic::amdgcn_global_load_tr_b64:
1364 case Intrinsic::amdgcn_global_load_tr_b128: {
1366 Info.memVT = MVT::getVT(CI.getType());
1367 Info.ptrVal = CI.getOperand(0);
1368 Info.align.reset();
1370 return true;
1371 }
1372 case Intrinsic::amdgcn_ds_gws_init:
1373 case Intrinsic::amdgcn_ds_gws_barrier:
1374 case Intrinsic::amdgcn_ds_gws_sema_v:
1375 case Intrinsic::amdgcn_ds_gws_sema_br:
1376 case Intrinsic::amdgcn_ds_gws_sema_p:
1377 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1379
1380 const GCNTargetMachine &TM =
1381 static_cast<const GCNTargetMachine &>(getTargetMachine());
1382
1384 Info.ptrVal = MFI->getGWSPSV(TM);
1385
1386 // This is an abstract access, but we need to specify a type and size.
1387 Info.memVT = MVT::i32;
1388 Info.size = 4;
1389 Info.align = Align(4);
1390
1391 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1393 else
1395 return true;
1396 }
1397 case Intrinsic::amdgcn_global_load_lds: {
1399 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1400 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1401 Info.ptrVal = CI.getArgOperand(1);
1403 return true;
1404 }
1405 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1407
1408 const GCNTargetMachine &TM =
1409 static_cast<const GCNTargetMachine &>(getTargetMachine());
1410
1412 Info.ptrVal = MFI->getGWSPSV(TM);
1413
1414 // This is an abstract access, but we need to specify a type and size.
1415 Info.memVT = MVT::i32;
1416 Info.size = 4;
1417 Info.align = Align(4);
1418
1420 return true;
1421 }
1422 default:
1423 return false;
1424 }
1425}
1426
1428 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1429 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1430 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1431 // The DAG's ValueType loses the addrspaces.
1432 // Add them as 2 extra Constant operands "from" and "to".
1433 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1434 unsigned DstAS = I.getType()->getPointerAddressSpace();
1435 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1436 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1437 break;
1438 }
1439 default:
1440 break;
1441 }
1442}
1443
1446 Type *&AccessTy) const {
1447 Value *Ptr = nullptr;
1448 switch (II->getIntrinsicID()) {
1449 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1450 case Intrinsic::amdgcn_ds_append:
1451 case Intrinsic::amdgcn_ds_consume:
1452 case Intrinsic::amdgcn_ds_fadd:
1453 case Intrinsic::amdgcn_ds_fmax:
1454 case Intrinsic::amdgcn_ds_fmin:
1455 case Intrinsic::amdgcn_ds_ordered_add:
1456 case Intrinsic::amdgcn_ds_ordered_swap:
1457 case Intrinsic::amdgcn_flat_atomic_fadd:
1458 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1459 case Intrinsic::amdgcn_flat_atomic_fmax:
1460 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1461 case Intrinsic::amdgcn_flat_atomic_fmin:
1462 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1463 case Intrinsic::amdgcn_global_atomic_csub:
1464 case Intrinsic::amdgcn_global_atomic_fadd:
1465 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1466 case Intrinsic::amdgcn_global_atomic_fmax:
1467 case Intrinsic::amdgcn_global_atomic_fmax_num:
1468 case Intrinsic::amdgcn_global_atomic_fmin:
1469 case Intrinsic::amdgcn_global_atomic_fmin_num:
1470 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1471 case Intrinsic::amdgcn_global_load_tr_b64:
1472 case Intrinsic::amdgcn_global_load_tr_b128:
1473 Ptr = II->getArgOperand(0);
1474 break;
1475 case Intrinsic::amdgcn_global_load_lds:
1476 Ptr = II->getArgOperand(1);
1477 break;
1478 default:
1479 return false;
1480 }
1481 AccessTy = II->getType();
1482 Ops.push_back(Ptr);
1483 return true;
1484}
1485
1486bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1487 unsigned AddrSpace,
1488 uint64_t FlatVariant) const {
1489 if (!Subtarget->hasFlatInstOffsets()) {
1490 // Flat instructions do not have offsets, and only have the register
1491 // address.
1492 return AM.BaseOffs == 0 && AM.Scale == 0;
1493 }
1494
1495 return AM.Scale == 0 &&
1496 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1497 AM.BaseOffs, AddrSpace, FlatVariant));
1498}
1499
1501 if (Subtarget->hasFlatGlobalInsts())
1502 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1504
1505 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1506    // Assume that we will use FLAT for all global memory accesses
1507 // on VI.
1508 // FIXME: This assumption is currently wrong. On VI we still use
1509 // MUBUF instructions for the r + i addressing mode. As currently
1510 // implemented, the MUBUF instructions only work on buffer < 4GB.
1511 // It may be possible to support > 4GB buffers with MUBUF instructions,
1512 // by setting the stride value in the resource descriptor which would
1513 // increase the size limit to (stride * 4GB). However, this is risky,
1514 // because it has never been validated.
1515 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1517 }
1518
1519 return isLegalMUBUFAddressingMode(AM);
1520}
1521
1522bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1523 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1524 // additionally can do r + r + i with addr64. 32-bit has more addressing
1525 // mode options. Depending on the resource constant, it can also do
1526 // (i64 r0) + (i32 r1) * (i14 i).
1527 //
1528 // Private arrays end up using a scratch buffer most of the time, so also
1529 // assume those use MUBUF instructions. Scratch loads / stores are currently
1530 // implemented as mubuf instructions with offen bit set, so slightly
1531 // different than the normal addr64.
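  // Illustrative summary of the forms accepted below (editor's addition, not
  // exhaustive):
  //   Scale == 0: r + 12-bit unsigned imm, or just the immediate.
  //   Scale == 1: r + r, or r + r + imm (addr64).
  //   Scale == 2 without a base register: 2 * r, treated as r + r.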
1532 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1533 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1534 return false;
1535
1536 // FIXME: Since we can split immediate into soffset and immediate offset,
1537 // would it make sense to allow any immediate?
1538
1539 switch (AM.Scale) {
1540 case 0: // r + i or just i, depending on HasBaseReg.
1541 return true;
1542 case 1:
1543 return true; // We have r + r or r + i.
1544 case 2:
1545 if (AM.HasBaseReg) {
1546 // Reject 2 * r + r.
1547 return false;
1548 }
1549
1550    // Allow 2 * r as r + r,
1551    // or 2 * r + i as r + r + i.
1552 return true;
1553 default: // Don't allow n * r
1554 return false;
1555 }
1556}
1557
1559 const AddrMode &AM, Type *Ty,
1560 unsigned AS, Instruction *I) const {
1561 // No global is ever allowed as a base.
1562 if (AM.BaseGV)
1563 return false;
1564
1565 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1566 return isLegalGlobalAddressingMode(AM);
1567
1568 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1572 // If the offset isn't a multiple of 4, it probably isn't going to be
1573 // correctly aligned.
1574 // FIXME: Can we get the real alignment here?
1575 if (AM.BaseOffs % 4 != 0)
1576 return isLegalMUBUFAddressingMode(AM);
1577
1578 if (!Subtarget->hasScalarSubwordLoads()) {
1579 // There are no SMRD extloads, so if we have to do a small type access we
1580 // will use a MUBUF load.
1581 // FIXME?: We also need to do this if unaligned, but we don't know the
1582 // alignment here.
1583 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1584 return isLegalGlobalAddressingMode(AM);
1585 }
1586
1588 // SMRD instructions have an 8-bit, dword offset on SI.
1589 if (!isUInt<8>(AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1592 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1593 // in 8-bits, it can use a smaller encoding.
1594 if (!isUInt<32>(AM.BaseOffs / 4))
1595 return false;
1596 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1597 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1598 if (!isUInt<20>(AM.BaseOffs))
1599 return false;
1600 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1601 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1602 // for S_BUFFER_* instructions).
1603 if (!isInt<21>(AM.BaseOffs))
1604 return false;
1605 } else {
1606 // On GFX12, all offsets are signed 24-bit in bytes.
1607 if (!isInt<24>(AM.BaseOffs))
1608 return false;
1609 }
1610
1611 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1612 return true;
1613
1614 if (AM.Scale == 1 && AM.HasBaseReg)
1615 return true;
1616
1617 return false;
1618 }
1619
1620 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1621 return Subtarget->enableFlatScratch()
1622 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1624 : isLegalMUBUFAddressingMode(AM);
1625
1626 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1627 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1628 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1629 // field.
1630 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1631 // an 8-bit dword offset but we don't know the alignment here.
1632 if (!isUInt<16>(AM.BaseOffs))
1633 return false;
1634
1635 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1636 return true;
1637
1638 if (AM.Scale == 1 && AM.HasBaseReg)
1639 return true;
1640
1641 return false;
1642 }
1643
1645 // For an unknown address space, this usually means that this is for some
1646 // reason being used for pure arithmetic, and not based on some addressing
1647 // computation. We don't have instructions that compute pointers with any
1648 // addressing modes, so treat them as having no offset like flat
1649 // instructions.
1650 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1652 }
1653
1654 // Assume a user alias of global for unknown address spaces.
1655 return isLegalGlobalAddressingMode(AM);
1656}
1657
1659 const MachineFunction &MF) const {
1661 return (MemVT.getSizeInBits() <= 4 * 32);
1662 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1663 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1664 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1665 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1666 return (MemVT.getSizeInBits() <= 2 * 32);
1667 }
1668 return true;
1669}
1670
1672 unsigned Size, unsigned AddrSpace, Align Alignment,
1673 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1674 if (IsFast)
1675 *IsFast = 0;
1676
1677 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1678 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1679 // Check if alignment requirements for ds_read/write instructions are
1680 // disabled.
1681 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1682 return false;
1683
1684 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1685 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1686 Alignment < RequiredAlignment)
1687 return false;
1688
1689    // Either the alignment requirements are "enabled", or there is an
1690    // unaligned-LDS-access-related hardware bug even though alignment
1691    // requirements are "disabled". In either case, we need to check for proper
1692    // alignment requirements.
1693 //
1694 switch (Size) {
1695 case 64:
1696 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1697 // address is negative, then the instruction is incorrectly treated as
1698 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1699 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1700 // load later in the SILoadStoreOptimizer.
1701 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1702 return false;
1703
1704      // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1705      // can do a 4-byte aligned, 8-byte access in a single operation using
1706      // ds_read2/write2_b32 with adjacent offsets.
1707 RequiredAlignment = Align(4);
1708
1709 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1710 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1711 // ds_write2_b32 depending on the alignment. In either case with either
1712 // alignment there is no faster way of doing this.
1713
1714 // The numbers returned here and below are not additive, it is a 'speed
1715 // rank'. They are just meant to be compared to decide if a certain way
1716 // of lowering an operation is faster than another. For that purpose
1717        // naturally aligned operation gets its bitsize to indicate that "it
1718 // operates with a speed comparable to N-bit wide load". With the full
1719 // alignment ds128 is slower than ds96 for example. If underaligned it
1720 // is comparable to a speed of a single dword access, which would then
1721 // mean 32 < 128 and it is faster to issue a wide load regardless.
1722        // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
1723        // a wider load which will no longer be aligned, the latter is slower.
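        // Worked example (editor's addition): with the ranking below, an
        // 8-byte LDS access aligned to at least 4 bytes reports 64, while one
        // aligned below 4 bytes reports 32, i.e. no faster than a single
        // dword access.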
1724 if (IsFast)
1725 *IsFast = (Alignment >= RequiredAlignment) ? 64
1726 : (Alignment < Align(4)) ? 32
1727 : 1;
1728 return true;
1729 }
1730
1731 break;
1732 case 96:
1733 if (!Subtarget->hasDS96AndDS128())
1734 return false;
1735
1736      // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1737      // gfx8 and older.
1738
1739 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1740 // Naturally aligned access is fastest. However, also report it is Fast
1741 // if memory is aligned less than DWORD. A narrow load or store will be
1742        // equally slow as a single ds_read_b96/ds_write_b96, but there will
1743 // be more of them, so overall we will pay less penalty issuing a single
1744 // instruction.
1745
1746 // See comment on the values above.
1747 if (IsFast)
1748 *IsFast = (Alignment >= RequiredAlignment) ? 96
1749 : (Alignment < Align(4)) ? 32
1750 : 1;
1751 return true;
1752 }
1753
1754 break;
1755 case 128:
1756 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1757 return false;
1758
1759      // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1760      // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1761      // single operation using ds_read2/write2_b64.
1762 RequiredAlignment = Align(8);
1763
1764 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1765 // Naturally aligned access is fastest. However, also report it is Fast
1766 // if memory is aligned less than DWORD. A narrow load or store will be
1767        // equally slow as a single ds_read_b128/ds_write_b128, but there
1768 // will be more of them, so overall we will pay less penalty issuing a
1769 // single instruction.
1770
1771 // See comment on the values above.
1772 if (IsFast)
1773 *IsFast = (Alignment >= RequiredAlignment) ? 128
1774 : (Alignment < Align(4)) ? 32
1775 : 1;
1776 return true;
1777 }
1778
1779 break;
1780 default:
1781 if (Size > 32)
1782 return false;
1783
1784 break;
1785 }
1786
1787 // See comment on the values above.
1788 // Note that we have a single-dword or sub-dword here, so if underaligned
1789    // it is the slowest possible access, hence the returned value is 0.
1790 if (IsFast)
1791 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1792
1793 return Alignment >= RequiredAlignment ||
1794 Subtarget->hasUnalignedDSAccessEnabled();
1795 }
1796
1797 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1798 bool AlignedBy4 = Alignment >= Align(4);
1799 if (IsFast)
1800 *IsFast = AlignedBy4;
1801
1802 return AlignedBy4 ||
1803 Subtarget->enableFlatScratch() ||
1804 Subtarget->hasUnalignedScratchAccess();
1805 }
1806
1807 // FIXME: We have to be conservative here and assume that flat operations
1808 // will access scratch. If we had access to the IR function, then we
1809 // could determine if any private memory was used in the function.
1810 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1811 !Subtarget->hasUnalignedScratchAccess()) {
1812 bool AlignedBy4 = Alignment >= Align(4);
1813 if (IsFast)
1814 *IsFast = AlignedBy4;
1815
1816 return AlignedBy4;
1817 }
1818
1819 // So long as they are correct, wide global memory operations perform better
1820 // than multiple smaller memory ops -- even when misaligned
1821 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1822 if (IsFast)
1823 *IsFast = Size;
1824
1825 return Alignment >= Align(4) ||
1827 }
1828
1829  // Values smaller than a dword must be aligned.
1830 if (Size < 32)
1831 return false;
1832
1833 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1834 // byte-address are ignored, thus forcing Dword alignment.
1835 // This applies to private, global, and constant memory.
1836 if (IsFast)
1837 *IsFast = 1;
1838
1839 return Size >= 32 && Alignment >= Align(4);
1840}
1841
1843 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1844 unsigned *IsFast) const {
1846 Alignment, Flags, IsFast);
1847}
1848
1850 const MemOp &Op, const AttributeList &FuncAttributes) const {
1851 // FIXME: Should account for address space here.
1852
1853 // The default fallback uses the private pointer size as a guess for a type to
1854 // use. Make sure we switch these to 64-bit accesses.
1855
1856 if (Op.size() >= 16 &&
1857 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1858 return MVT::v4i32;
1859
1860 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1861 return MVT::v2i32;
1862
1863 // Use the default.
1864 return MVT::Other;
1865}
1866
1868 const MemSDNode *MemNode = cast<MemSDNode>(N);
1869 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1870}
1871
1873 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1875}
1876
1878 unsigned DestAS) const {
1879 // Flat -> private/local is a simple truncate.
1880  // Flat -> global is a no-op.
1881 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1882 return true;
1883
1884 const GCNTargetMachine &TM =
1885 static_cast<const GCNTargetMachine &>(getTargetMachine());
1886 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1887}
1888
1890 const MemSDNode *MemNode = cast<MemSDNode>(N);
1891
1893}
1894
1897 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1898 VT.getScalarType().bitsLE(MVT::i16))
1901}
1902
1904 Type *Ty) const {
1905 // FIXME: Could be smarter if called for vector constants.
1906 return true;
1907}
1908
1910 unsigned Index) const {
1912 return false;
1913
1914 // TODO: Add more cases that are cheap.
1915 return Index == 0;
1916}
1917
1919 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1920 switch (Op) {
1921 case ISD::LOAD:
1922 case ISD::STORE:
1923
1924 // These operations are done with 32-bit instructions anyway.
1925 case ISD::AND:
1926 case ISD::OR:
1927 case ISD::XOR:
1928 case ISD::SELECT:
1929 // TODO: Extensions?
1930 return true;
1931 default:
1932 return false;
1933 }
1934 }
1935
1936 // SimplifySetCC uses this function to determine whether or not it should
1937 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1938 if (VT == MVT::i1 && Op == ISD::SETCC)
1939 return false;
1940
1942}
1943
1944SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1945 const SDLoc &SL,
1946 SDValue Chain,
1947 uint64_t Offset) const {
1948 const DataLayout &DL = DAG.getDataLayout();
1951
1952 const ArgDescriptor *InputPtrReg;
1953 const TargetRegisterClass *RC;
1954 LLT ArgTy;
1956
1957 std::tie(InputPtrReg, RC, ArgTy) =
1959
1960 // We may not have the kernarg segment argument if we have no kernel
1961 // arguments.
1962 if (!InputPtrReg)
1963 return DAG.getConstant(Offset, SL, PtrVT);
1964
1966 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1967 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1968
1969 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1970}
1971
1972SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1973 const SDLoc &SL) const {
1976 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1977}
1978
1979SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1980 const SDLoc &SL) const {
1981
1983 std::optional<uint32_t> KnownSize =
1985 if (KnownSize.has_value())
1986 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1987 return SDValue();
1988}
1989
1990SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1991 const SDLoc &SL, SDValue Val,
1992 bool Signed,
1993 const ISD::InputArg *Arg) const {
1994 // First, if it is a widened vector, narrow it.
1995 if (VT.isVector() &&
1997 EVT NarrowedVT =
2000 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2001 DAG.getConstant(0, SL, MVT::i32));
2002 }
2003
2004 // Then convert the vector elements or scalar value.
2005 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2006 VT.bitsLT(MemVT)) {
2007 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2008 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2009 }
2010
2011 if (MemVT.isFloatingPoint())
2012 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2013 else if (Signed)
2014 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2015 else
2016 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2017
2018 return Val;
2019}
2020
2021SDValue SITargetLowering::lowerKernargMemParameter(
2022 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2023 uint64_t Offset, Align Alignment, bool Signed,
2024 const ISD::InputArg *Arg) const {
2026
2027 // Try to avoid using an extload by loading earlier than the argument address,
2028 // and extracting the relevant bits. The load should hopefully be merged with
2029 // the previous argument.
2030 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2031 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2032 int64_t AlignDownOffset = alignDown(Offset, 4);
2033 int64_t OffsetDiff = Offset - AlignDownOffset;
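    // Worked example with hypothetical values: a 2-byte argument at kernarg
    // offset 6 gives AlignDownOffset = 4 and OffsetDiff = 2, so the dword at
    // offset 4 is loaded, shifted right by 16 bits, and truncated before the
    // bitcast and conversion below.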
2034
2035 EVT IntVT = MemVT.changeTypeToInteger();
2036
2037 // TODO: If we passed in the base kernel offset we could have a better
2038 // alignment than 4, but we don't really need it.
2039 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2040 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2043
2044 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2045 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2046
2047 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2048 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2049 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2050
2051
2052 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2053 }
2054
2055 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2056 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2059
2060 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2061 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2062}
2063
2064SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2065 const SDLoc &SL, SDValue Chain,
2066 const ISD::InputArg &Arg) const {
2068 MachineFrameInfo &MFI = MF.getFrameInfo();
2069
2070 if (Arg.Flags.isByVal()) {
2071 unsigned Size = Arg.Flags.getByValSize();
2072 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2073 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2074 }
2075
2076 unsigned ArgOffset = VA.getLocMemOffset();
2077 unsigned ArgSize = VA.getValVT().getStoreSize();
2078
2079 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2080
2081 // Create load nodes to retrieve arguments from the stack.
2082 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2083 SDValue ArgValue;
2084
 2085 // For ISD::NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
 2086 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2087 MVT MemVT = VA.getValVT();
2088
2089 switch (VA.getLocInfo()) {
2090 default:
2091 break;
2092 case CCValAssign::BCvt:
2093 MemVT = VA.getLocVT();
2094 break;
2095 case CCValAssign::SExt:
2096 ExtType = ISD::SEXTLOAD;
2097 break;
2098 case CCValAssign::ZExt:
2099 ExtType = ISD::ZEXTLOAD;
2100 break;
2101 case CCValAssign::AExt:
2102 ExtType = ISD::EXTLOAD;
2103 break;
2104 }
2105
2106 ArgValue = DAG.getExtLoad(
2107 ExtType, SL, VA.getLocVT(), Chain, FIN,
2109 MemVT);
2110 return ArgValue;
2111}
2112
2113SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2114 const SIMachineFunctionInfo &MFI,
2115 EVT VT,
2117 const ArgDescriptor *Reg = nullptr;
2118 const TargetRegisterClass *RC;
2119 LLT Ty;
2120
2122 const ArgDescriptor WorkGroupIDX =
2123 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2124 // If GridZ is not programmed in an entry function then the hardware will set
2125 // it to all zeros, so there is no need to mask the GridY value in the low
2126 // order bits.
2127 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2128 AMDGPU::TTMP7,
2129 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2130 const ArgDescriptor WorkGroupIDZ =
2131 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
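  // With architected SGPRs the hardware provides the workgroup IDs in trap
  // temporary registers rather than user SGPRs: X in TTMP9 and, as the masks
  // above encode, Y in TTMP7[15:0] and Z in TTMP7[31:16].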
2132 if (Subtarget->hasArchitectedSGPRs() &&
2134 switch (PVID) {
2136 Reg = &WorkGroupIDX;
2137 RC = &AMDGPU::SReg_32RegClass;
2138 Ty = LLT::scalar(32);
2139 break;
2141 Reg = &WorkGroupIDY;
2142 RC = &AMDGPU::SReg_32RegClass;
2143 Ty = LLT::scalar(32);
2144 break;
2146 Reg = &WorkGroupIDZ;
2147 RC = &AMDGPU::SReg_32RegClass;
2148 Ty = LLT::scalar(32);
2149 break;
2150 default:
2151 break;
2152 }
2153 }
2154
2155 if (!Reg)
2156 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2157 if (!Reg) {
2159 // It's possible for a kernarg intrinsic call to appear in a kernel with
2160 // no allocated segment, in which case we do not add the user sgpr
2161 // argument, so just return null.
2162 return DAG.getConstant(0, SDLoc(), VT);
2163 }
2164
2165 // It's undefined behavior if a function marked with the amdgpu-no-*
2166 // attributes uses the corresponding intrinsic.
2167 return DAG.getUNDEF(VT);
2168 }
2169
2170 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2171}
2172
2174 CallingConv::ID CallConv,
2175 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2176 FunctionType *FType,
2177 SIMachineFunctionInfo *Info) {
2178 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2179 const ISD::InputArg *Arg = &Ins[I];
2180
2181 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2182 "vector type argument should have been split");
2183
2184 // First check if it's a PS input addr.
2185 if (CallConv == CallingConv::AMDGPU_PS &&
2186 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2187 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2188
2189 // Inconveniently only the first part of the split is marked as isSplit,
2190 // so skip to the end. We only want to increment PSInputNum once for the
2191 // entire split argument.
2192 if (Arg->Flags.isSplit()) {
2193 while (!Arg->Flags.isSplitEnd()) {
2194 assert((!Arg->VT.isVector() ||
2195 Arg->VT.getScalarSizeInBits() == 16) &&
2196 "unexpected vector split in ps argument type");
2197 if (!SkipArg)
2198 Splits.push_back(*Arg);
2199 Arg = &Ins[++I];
2200 }
2201 }
2202
2203 if (SkipArg) {
2204 // We can safely skip PS inputs.
2205 Skipped.set(Arg->getOrigArgIndex());
2206 ++PSInputNum;
2207 continue;
2208 }
2209
2210 Info->markPSInputAllocated(PSInputNum);
2211 if (Arg->Used)
2212 Info->markPSInputEnabled(PSInputNum);
2213
2214 ++PSInputNum;
2215 }
2216
2217 Splits.push_back(*Arg);
2218 }
2219}
2220
2221// Allocate special inputs passed in VGPRs.
2223 MachineFunction &MF,
2224 const SIRegisterInfo &TRI,
2225 SIMachineFunctionInfo &Info) const {
2226 const LLT S32 = LLT::scalar(32);
2228
2229 if (Info.hasWorkItemIDX()) {
2230 Register Reg = AMDGPU::VGPR0;
2231 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2232
2233 CCInfo.AllocateReg(Reg);
2234 unsigned Mask = (Subtarget->hasPackedTID() &&
2235 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2236 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2237 }
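  // When the subtarget packs work-item IDs, a single VGPR0 carries all three
  // IDs: X in bits [9:0], Y in bits [19:10] and Z in bits [29:20], which is
  // what the shifted 0x3ff masks below express.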
2238
2239 if (Info.hasWorkItemIDY()) {
2240 assert(Info.hasWorkItemIDX());
2241 if (Subtarget->hasPackedTID()) {
2242 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2243 0x3ff << 10));
2244 } else {
2245 unsigned Reg = AMDGPU::VGPR1;
2246 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2247
2248 CCInfo.AllocateReg(Reg);
2249 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2250 }
2251 }
2252
2253 if (Info.hasWorkItemIDZ()) {
2254 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2255 if (Subtarget->hasPackedTID()) {
2256 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2257 0x3ff << 20));
2258 } else {
2259 unsigned Reg = AMDGPU::VGPR2;
2260 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2261
2262 CCInfo.AllocateReg(Reg);
2263 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2264 }
2265 }
2266}
2267
2268 // Try to allocate a VGPR at the end of the argument list, or if no argument
2269 // VGPRs are left, allocate a stack slot.
2270 // If \p Mask is given it indicates the bitfield position in the register.
2271 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2272static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2273 ArgDescriptor Arg = ArgDescriptor()) {
2274 if (Arg.isSet())
2275 return ArgDescriptor::createArg(Arg, Mask);
2276
2277 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2278 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2279 if (RegIdx == ArgVGPRs.size()) {
2280 // Spill to stack required.
2281 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2282
2283 return ArgDescriptor::createStack(Offset, Mask);
2284 }
2285
2286 unsigned Reg = ArgVGPRs[RegIdx];
2287 Reg = CCInfo.AllocateReg(Reg);
2288 assert(Reg != AMDGPU::NoRegister);
2289
2290 MachineFunction &MF = CCInfo.getMachineFunction();
2291 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2292 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2293 return ArgDescriptor::createRegister(Reg, Mask);
2294}
2295
2297 const TargetRegisterClass *RC,
2298 unsigned NumArgRegs) {
2299 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2300 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2301 if (RegIdx == ArgSGPRs.size())
2302 report_fatal_error("ran out of SGPRs for arguments");
2303
2304 unsigned Reg = ArgSGPRs[RegIdx];
2305 Reg = CCInfo.AllocateReg(Reg);
2306 assert(Reg != AMDGPU::NoRegister);
2307
2308 MachineFunction &MF = CCInfo.getMachineFunction();
2309 MF.addLiveIn(Reg, RC);
2311}
2312
2313 // If this has a fixed position, we should still allocate the register in the
2314// CCInfo state. Technically we could get away with this for values passed
2315// outside of the normal argument range.
2317 const TargetRegisterClass *RC,
2318 MCRegister Reg) {
2319 Reg = CCInfo.AllocateReg(Reg);
2320 assert(Reg != AMDGPU::NoRegister);
2321 MachineFunction &MF = CCInfo.getMachineFunction();
2322 MF.addLiveIn(Reg, RC);
2323}
2324
2325static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2326 if (Arg) {
2327 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2328 Arg.getRegister());
2329 } else
2330 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2331}
2332
2333static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2334 if (Arg) {
2335 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2336 Arg.getRegister());
2337 } else
2338 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2339}
2340
2341/// Allocate implicit function VGPR arguments at the end of allocated user
2342/// arguments.
2344 CCState &CCInfo, MachineFunction &MF,
2345 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2346 const unsigned Mask = 0x3ff;
2347 ArgDescriptor Arg;
2348
2349 if (Info.hasWorkItemIDX()) {
2350 Arg = allocateVGPR32Input(CCInfo, Mask);
2351 Info.setWorkItemIDX(Arg);
2352 }
2353
2354 if (Info.hasWorkItemIDY()) {
2355 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2356 Info.setWorkItemIDY(Arg);
2357 }
2358
2359 if (Info.hasWorkItemIDZ())
2360 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2361}
2362
2363/// Allocate implicit function VGPR arguments in fixed registers.
2365 CCState &CCInfo, MachineFunction &MF,
2366 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2367 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2368 if (!Reg)
2369 report_fatal_error("failed to allocated VGPR for implicit arguments");
2370
2371 const unsigned Mask = 0x3ff;
2372 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2373 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2374 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2375}
2376
2378 CCState &CCInfo,
2379 MachineFunction &MF,
2380 const SIRegisterInfo &TRI,
2381 SIMachineFunctionInfo &Info) const {
2382 auto &ArgInfo = Info.getArgInfo();
2383 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2384
2385 // TODO: Unify handling with private memory pointers.
2386 if (UserSGPRInfo.hasDispatchPtr())
2387 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2388
2389 const Module *M = MF.getFunction().getParent();
2390 if (UserSGPRInfo.hasQueuePtr() &&
2392 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2393
2394 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2395 // constant offset from the kernarg segment.
2396 if (Info.hasImplicitArgPtr())
2397 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2398
2399 if (UserSGPRInfo.hasDispatchID())
2400 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2401
2402 // flat_scratch_init is not applicable for non-kernel functions.
2403
2404 if (Info.hasWorkGroupIDX())
2405 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2406
2407 if (Info.hasWorkGroupIDY())
2408 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2409
2410 if (Info.hasWorkGroupIDZ())
2411 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2412
2413 if (Info.hasLDSKernelId())
2414 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2415}
2416
2417// Allocate special inputs passed in user SGPRs.
2419 MachineFunction &MF,
2420 const SIRegisterInfo &TRI,
2421 SIMachineFunctionInfo &Info) const {
2422 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2423 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2424 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2425 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2426 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2427 }
2428
2429 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2430 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2431 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2432 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2433 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2434 }
2435
2436 if (UserSGPRInfo.hasDispatchPtr()) {
2437 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2438 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2439 CCInfo.AllocateReg(DispatchPtrReg);
2440 }
2441
2442 const Module *M = MF.getFunction().getParent();
2443 if (UserSGPRInfo.hasQueuePtr() &&
2445 Register QueuePtrReg = Info.addQueuePtr(TRI);
2446 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2447 CCInfo.AllocateReg(QueuePtrReg);
2448 }
2449
2450 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2452 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2453 CCInfo.AllocateReg(InputPtrReg);
2454
2455 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2456 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2457 }
2458
2459 if (UserSGPRInfo.hasDispatchID()) {
2460 Register DispatchIDReg = Info.addDispatchID(TRI);
2461 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2462 CCInfo.AllocateReg(DispatchIDReg);
2463 }
2464
2465 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2466 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2467 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2468 CCInfo.AllocateReg(FlatScratchInitReg);
2469 }
2470
2471 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2472 // these from the dispatch pointer.
2473}
2474
2475 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2476 // sequential, starting from the first argument.
2478 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2480 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2481 Function &F = MF.getFunction();
2482 unsigned LastExplicitArgOffset =
2483 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2484 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2485 bool InPreloadSequence = true;
2486 unsigned InIdx = 0;
2487 for (auto &Arg : F.args()) {
2488 if (!InPreloadSequence || !Arg.hasInRegAttr())
2489 break;
2490
2491 int ArgIdx = Arg.getArgNo();
2492 // Don't preload non-original args or parts not in the current preload
2493 // sequence.
2494 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2495 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2496 break;
2497
2498 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2499 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2500 InIdx++) {
2501 assert(ArgLocs[ArgIdx].isMemLoc());
2502 auto &ArgLoc = ArgLocs[InIdx];
2503 const Align KernelArgBaseAlign = Align(16);
2504 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2505 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2506 unsigned NumAllocSGPRs =
2507 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2508
2509 // Arg is preloaded into the previous SGPR.
2510 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2511 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2512 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2513 continue;
2514 }
2515
2516 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2517 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
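      // Illustrative: if the previous argument ended at offset 8 and this one
      // starts at offset 16, Padding is 8 bytes, i.e. two dead padding SGPRs
      // that must also fit within the free user SGPR budget checked below.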
2518 // Check for free user SGPRs for preloading.
2519 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2520 SGPRInfo.getNumFreeUserSGPRs()) {
2521 InPreloadSequence = false;
2522 break;
2523 }
2524
2525 // Preload this argument.
2526 const TargetRegisterClass *RC =
2527 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2528 SmallVectorImpl<MCRegister> *PreloadRegs =
2529 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2530
2531 if (PreloadRegs->size() > 1)
2532 RC = &AMDGPU::SGPR_32RegClass;
2533 for (auto &Reg : *PreloadRegs) {
2534 assert(Reg);
2535 MF.addLiveIn(Reg, RC);
2536 CCInfo.AllocateReg(Reg);
2537 }
2538
2539 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2540 }
2541 }
2542}
2543
2545 const SIRegisterInfo &TRI,
2546 SIMachineFunctionInfo &Info) const {
2547 // Always allocate this last since it is a synthetic preload.
2548 if (Info.hasLDSKernelId()) {
2549 Register Reg = Info.addLDSKernelId();
2550 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2551 CCInfo.AllocateReg(Reg);
2552 }
2553}
2554
2555// Allocate special input registers that are initialized per-wave.
2557 MachineFunction &MF,
2559 CallingConv::ID CallConv,
2560 bool IsShader) const {
2561 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2562 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2563 // Note: user SGPRs are handled by the front-end for graphics shaders
2564 // Pad up the used user SGPRs with dead inputs.
2565
2566 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2567 // before enabling architected SGPRs for workgroup IDs.
2568 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2569
2570 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2571 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2572 // rely on it to reach 16 since if we end up having no stack usage, it will
2573 // not really be added.
2574 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2575 Info.hasWorkGroupIDY() +
2576 Info.hasWorkGroupIDZ() +
2577 Info.hasWorkGroupInfo();
2578 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2579 Register Reg = Info.addReservedUserSGPR();
2580 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2581 CCInfo.AllocateReg(Reg);
2582 }
2583 }
2584
2585 if (!HasArchitectedSGPRs) {
2586 if (Info.hasWorkGroupIDX()) {
2587 Register Reg = Info.addWorkGroupIDX();
2588 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2589 CCInfo.AllocateReg(Reg);
2590 }
2591
2592 if (Info.hasWorkGroupIDY()) {
2593 Register Reg = Info.addWorkGroupIDY();
2594 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2595 CCInfo.AllocateReg(Reg);
2596 }
2597
2598 if (Info.hasWorkGroupIDZ()) {
2599 Register Reg = Info.addWorkGroupIDZ();
2600 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2601 CCInfo.AllocateReg(Reg);
2602 }
2603 }
2604
2605 if (Info.hasWorkGroupInfo()) {
2606 Register Reg = Info.addWorkGroupInfo();
2607 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2608 CCInfo.AllocateReg(Reg);
2609 }
2610
2611 if (Info.hasPrivateSegmentWaveByteOffset()) {
2612 // Scratch wave offset passed in system SGPR.
2613 unsigned PrivateSegmentWaveByteOffsetReg;
2614
2615 if (IsShader) {
2616 PrivateSegmentWaveByteOffsetReg =
2617 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2618
2619 // This is true if the scratch wave byte offset doesn't have a fixed
2620 // location.
2621 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2622 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2623 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2624 }
2625 } else
2626 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2627
2628 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2630 }
2631
2632 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2633 Info.getNumPreloadedSGPRs() >= 16);
2634}
2635
2637 MachineFunction &MF,
2638 const SIRegisterInfo &TRI,
2639 SIMachineFunctionInfo &Info) {
 2640 // Now that we've figured out where the scratch register inputs are, see if
 2641 // we should reserve the arguments and use them directly.
2642 MachineFrameInfo &MFI = MF.getFrameInfo();
2643 bool HasStackObjects = MFI.hasStackObjects();
2644 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2645
2646 // Record that we know we have non-spill stack objects so we don't need to
2647 // check all stack objects later.
2648 if (HasStackObjects)
2649 Info.setHasNonSpillStackObjects(true);
2650
2651 // Everything live out of a block is spilled with fast regalloc, so it's
2652 // almost certain that spilling will be required.
2653 if (TM.getOptLevel() == CodeGenOptLevel::None)
2654 HasStackObjects = true;
2655
 2656 // For now, assume stack access is needed in any callee function, so we need
 2657 // to pass in the scratch registers.
2658 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2659
2660 if (!ST.enableFlatScratch()) {
2661 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2662 // If we have stack objects, we unquestionably need the private buffer
2663 // resource. For the Code Object V2 ABI, this will be the first 4 user
2664 // SGPR inputs. We can reserve those and use them directly.
2665
2666 Register PrivateSegmentBufferReg =
2668 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2669 } else {
2670 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 2671 // We tentatively reserve the last registers (skipping those which may
 2672 // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
 2673 // replace these with the registers immediately after those which were
 2674 // really allocated. In the prologue, copies will be inserted from the
 2675 // argument to these reserved registers.
2676
2677 // Without HSA, relocations are used for the scratch pointer and the
2678 // buffer resource setup is always inserted in the prologue. Scratch wave
2679 // offset is still in an input SGPR.
2680 Info.setScratchRSrcReg(ReservedBufferReg);
2681 }
2682 }
2683
2685
2686 // For entry functions we have to set up the stack pointer if we use it,
2687 // whereas non-entry functions get this "for free". This means there is no
2688 // intrinsic advantage to using S32 over S34 in cases where we do not have
2689 // calls but do need a frame pointer (i.e. if we are requested to have one
2690 // because frame pointer elimination is disabled). To keep things simple we
2691 // only ever use S32 as the call ABI stack pointer, and so using it does not
2692 // imply we need a separate frame pointer.
2693 //
2694 // Try to use s32 as the SP, but move it if it would interfere with input
2695 // arguments. This won't work with calls though.
2696 //
2697 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2698 // registers.
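  // Illustrative: if a shader's inputs already cover s32, the fallback below
  // walks SGPR_32 and picks the first register that is not a live-in, and it
  // rejects the function outright if it also contains calls.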
2699 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2700 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2701 } else {
2703
2704 if (MFI.hasCalls())
2705 report_fatal_error("call in graphics shader with too many input SGPRs");
2706
2707 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2708 if (!MRI.isLiveIn(Reg)) {
2709 Info.setStackPtrOffsetReg(Reg);
2710 break;
2711 }
2712 }
2713
2714 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2715 report_fatal_error("failed to find register for SP");
2716 }
2717
2718 // hasFP should be accurate for entry functions even before the frame is
2719 // finalized, because it does not rely on the known stack size, only
2720 // properties like whether variable sized objects are present.
2721 if (ST.getFrameLowering()->hasFP(MF)) {
2722 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2723 }
2724}
2725
2728 return !Info->isEntryFunction();
2729}
2730
2732
2733}
2734
2736 MachineBasicBlock *Entry,
2737 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2739
2740 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2741 if (!IStart)
2742 return;
2743
2744 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2745 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2746 MachineBasicBlock::iterator MBBI = Entry->begin();
2747 for (const MCPhysReg *I = IStart; *I; ++I) {
2748 const TargetRegisterClass *RC = nullptr;
2749 if (AMDGPU::SReg_64RegClass.contains(*I))
2750 RC = &AMDGPU::SGPR_64RegClass;
2751 else if (AMDGPU::SReg_32RegClass.contains(*I))
2752 RC = &AMDGPU::SGPR_32RegClass;
2753 else
2754 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2755
2756 Register NewVR = MRI->createVirtualRegister(RC);
2757 // Create copy from CSR to a virtual register.
2758 Entry->addLiveIn(*I);
2759 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2760 .addReg(*I);
2761
2762 // Insert the copy-back instructions right before the terminator.
2763 for (auto *Exit : Exits)
2764 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2765 TII->get(TargetOpcode::COPY), *I)
2766 .addReg(NewVR);
2767 }
2768}
2769
2771 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2772 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2773 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2775
2777 const Function &Fn = MF.getFunction();
2780
2781 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2782 DiagnosticInfoUnsupported NoGraphicsHSA(
2783 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2784 DAG.getContext()->diagnose(NoGraphicsHSA);
2785 return DAG.getEntryNode();
2786 }
2787
2790 BitVector Skipped(Ins.size());
2791 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2792 *DAG.getContext());
2793
2794 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2795 bool IsKernel = AMDGPU::isKernel(CallConv);
2796 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2797
2798 if (IsGraphics) {
2799 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2800 assert(!UserSGPRInfo.hasDispatchPtr() &&
2801 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2802 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2803 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2804 (void)UserSGPRInfo;
2805 if (!Subtarget->enableFlatScratch())
2806 assert(!UserSGPRInfo.hasFlatScratchInit());
2807 if ((CallConv != CallingConv::AMDGPU_CS &&
2808 CallConv != CallingConv::AMDGPU_Gfx) ||
2809 !Subtarget->hasArchitectedSGPRs())
2810 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2811 !Info->hasWorkGroupIDZ());
2812 }
2813
2814 if (CallConv == CallingConv::AMDGPU_PS) {
2815 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2816
2817 // At least one interpolation mode must be enabled or else the GPU will
2818 // hang.
2819 //
2820 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2821 // set PSInputAddr, the user wants to enable some bits after the compilation
2822 // based on run-time states. Since we can't know what the final PSInputEna
 2823 // will look like, we shouldn't do anything here and the user should take
2824 // responsibility for the correct programming.
2825 //
2826 // Otherwise, the following restrictions apply:
2827 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2828 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2829 // enabled too.
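  // Illustrative: with a hypothetical PSInputAddr of 0x800 (only POS_W_FLOAT
  // requested), no PERSP_*/LINEAR_* bit is set, so the code below force-enables
  // input 0 and reserves VGPR0/VGPR1 for it.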
2830 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2831 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2832 CCInfo.AllocateReg(AMDGPU::VGPR0);
2833 CCInfo.AllocateReg(AMDGPU::VGPR1);
2834 Info->markPSInputAllocated(0);
2835 Info->markPSInputEnabled(0);
2836 }
2837 if (Subtarget->isAmdPalOS()) {
2838 // For isAmdPalOS, the user does not enable some bits after compilation
2839 // based on run-time states; the register values being generated here are
2840 // the final ones set in hardware. Therefore we need to apply the
2841 // workaround to PSInputAddr and PSInputEnable together. (The case where
2842 // a bit is set in PSInputAddr but not PSInputEnable is where the
2843 // frontend set up an input arg for a particular interpolation mode, but
2844 // nothing uses that input arg. Really we should have an earlier pass
2845 // that removes such an arg.)
2846 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2847 if ((PsInputBits & 0x7F) == 0 ||
2848 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2849 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2850 }
2851 } else if (IsKernel) {
2852 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2853 } else {
2854 Splits.append(Ins.begin(), Ins.end());
2855 }
2856
2857 if (IsKernel)
2858 analyzeFormalArgumentsCompute(CCInfo, Ins);
2859
2860 if (IsEntryFunc) {
2861 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2862 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2863 if (IsKernel && Subtarget->hasKernargPreload())
2864 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2865
2866 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2867 } else if (!IsGraphics) {
2868 // For the fixed ABI, pass workitem IDs in the last argument register.
2869 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2870
2871 // FIXME: Sink this into allocateSpecialInputSGPRs
2872 if (!Subtarget->enableFlatScratch())
2873 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2874
2875 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2876 }
2877
2878 if (!IsKernel) {
2879 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2880 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2881 }
2882
2884
2885 // FIXME: This is the minimum kernel argument alignment. We should improve
2886 // this to the maximum alignment of the arguments.
2887 //
2888 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2889 // kern arg offset.
2890 const Align KernelArgBaseAlign = Align(16);
2891
2892 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2893 const ISD::InputArg &Arg = Ins[i];
2894 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2895 InVals.push_back(DAG.getUNDEF(Arg.VT));
2896 continue;
2897 }
2898
2899 CCValAssign &VA = ArgLocs[ArgIdx++];
2900 MVT VT = VA.getLocVT();
2901
2902 if (IsEntryFunc && VA.isMemLoc()) {
2903 VT = Ins[i].VT;
2904 EVT MemVT = VA.getLocVT();
2905
2906 const uint64_t Offset = VA.getLocMemOffset();
2907 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2908
2909 if (Arg.Flags.isByRef()) {
2910 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2911
2912 const GCNTargetMachine &TM =
2913 static_cast<const GCNTargetMachine &>(getTargetMachine());
2914 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2915 Arg.Flags.getPointerAddrSpace())) {
2918 }
2919
2920 InVals.push_back(Ptr);
2921 continue;
2922 }
2923
2924 SDValue NewArg;
2925 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2926 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2927 // In this case the argument is packed into the previous preload SGPR.
2928 int64_t AlignDownOffset = alignDown(Offset, 4);
2929 int64_t OffsetDiff = Offset - AlignDownOffset;
2930 EVT IntVT = MemVT.changeTypeToInteger();
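        // Illustrative, with hypothetical values: an i8 argument at kernarg
        // offset 5 shares the SGPR that preloaded bytes [4, 8), so OffsetDiff
        // is 1 and the 32-bit copy below is shifted right by 8 bits before
        // being truncated.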
2931
2935 Register Reg =
2936 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2937
2938 assert(Reg);
2939 Register VReg = MRI.getLiveInVirtReg(Reg);
2940 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2941
2942 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2943 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2944
2945 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2946 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2947 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2948 Ins[i].Flags.isSExt(), &Ins[i]);
2949
2950 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2951 } else {
2955 const SmallVectorImpl<MCRegister> &PreloadRegs =
2956 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2957
2958 SDValue Copy;
2959 if (PreloadRegs.size() == 1) {
2960 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2961 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2962 NewArg = DAG.getCopyFromReg(
2963 Chain, DL, VReg,
2965 TRI->getRegSizeInBits(*RC)));
2966
2967 } else {
2968 // If the kernarg alignment does not match the alignment of the SGPR
2969 // tuple RC that can accommodate this argument, it will be built up
 2970 // via copies from the individual SGPRs that the argument was
2971 // preloaded to.
2973 for (auto Reg : PreloadRegs) {
2974 Register VReg = MRI.getLiveInVirtReg(Reg);
2975 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2976 Elts.push_back(Copy);
2977 }
2978 NewArg =
2979 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2980 PreloadRegs.size()),
2981 DL, Elts);
2982 }
2983
2984 // If the argument was preloaded to multiple consecutive 32-bit
2985 // registers because of misalignment between addressable SGPR tuples
 2986 // and the argument size, we can still assume, because of kernarg
 2987 // segment alignment restrictions, that NewArg's size is the same as
2988 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
2989 // truncate since we cannot preload to less than a single SGPR and the
2990 // MemVT may be smaller.
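        // Illustrative: a hypothetical i16 argument preloaded with 4-byte
        // alignment still occupies a full 32-bit SGPR, so MemVT is smaller
        // than the copied value and the truncate below is needed before the
        // bitcast and conversion.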
2991 EVT MemVTInt =
2993 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
2994 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
2995
2996 NewArg = DAG.getBitcast(MemVT, NewArg);
2997 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
2998 Ins[i].Flags.isSExt(), &Ins[i]);
2999 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3000 }
3001 } else {
3002 NewArg =
3003 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3004 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3005 }
3006 Chains.push_back(NewArg.getValue(1));
3007
3008 auto *ParamTy =
3009 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3011 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3012 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3013 // On SI local pointers are just offsets into LDS, so they are always
3014 // less than 16-bits. On CI and newer they could potentially be
3015 // real pointers, so we can't guarantee their size.
3016 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3017 DAG.getValueType(MVT::i16));
3018 }
3019
3020 InVals.push_back(NewArg);
3021 continue;
3022 } else if (!IsEntryFunc && VA.isMemLoc()) {
3023 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3024 InVals.push_back(Val);
3025 if (!Arg.Flags.isByVal())
3026 Chains.push_back(Val.getValue(1));
3027 continue;
3028 }
3029
3030 assert(VA.isRegLoc() && "Parameter must be in a register!");
3031
3032 Register Reg = VA.getLocReg();
3033 const TargetRegisterClass *RC = nullptr;
3034 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3035 RC = &AMDGPU::VGPR_32RegClass;
3036 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3037 RC = &AMDGPU::SGPR_32RegClass;
3038 else
3039 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3040 EVT ValVT = VA.getValVT();
3041
3042 Reg = MF.addLiveIn(Reg, RC);
3043 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3044
3045 if (Arg.Flags.isSRet()) {
3046 // The return object should be reasonably addressable.
3047
 3048 // FIXME: This helps when the return is a real sret. If it is an
3049 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3050 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3051 unsigned NumBits
3053 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3054 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3055 }
3056
3057 // If this is an 8 or 16-bit value, it is really passed promoted
3058 // to 32 bits. Insert an assert[sz]ext to capture this, then
3059 // truncate to the right size.
3060 switch (VA.getLocInfo()) {
3061 case CCValAssign::Full:
3062 break;
3063 case CCValAssign::BCvt:
3064 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3065 break;
3066 case CCValAssign::SExt:
3067 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3068 DAG.getValueType(ValVT));
3069 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3070 break;
3071 case CCValAssign::ZExt:
3072 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3073 DAG.getValueType(ValVT));
3074 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3075 break;
3076 case CCValAssign::AExt:
3077 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3078 break;
3079 default:
3080 llvm_unreachable("Unknown loc info!");
3081 }
3082
3083 InVals.push_back(Val);
3084 }
3085
3086 // Start adding system SGPRs.
3087 if (IsEntryFunc)
3088 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3089
3090 auto &ArgUsageInfo =
3092 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3093
3094 unsigned StackArgSize = CCInfo.getStackSize();
3095 Info->setBytesInStackArgArea(StackArgSize);
3096
3097 return Chains.empty() ? Chain :
3098 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3099}
3100
3101// TODO: If return values can't fit in registers, we should return as many as
3102// possible in registers before passing on stack.
3104 CallingConv::ID CallConv,
3105 MachineFunction &MF, bool IsVarArg,
3107 LLVMContext &Context) const {
3108 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3109 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3110 // for shaders. Vector types should be explicitly handled by CC.
3111 if (AMDGPU::isEntryFunctionCC(CallConv))
3112 return true;
3113
3115 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3116 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3117 return false;
3118
3119 // We must use the stack if return would require unavailable registers.
3120 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3121 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3122 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3123 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3124 return false;
3125
3126 return true;
3127}
3128
3129SDValue
3131 bool isVarArg,
3133 const SmallVectorImpl<SDValue> &OutVals,
3134 const SDLoc &DL, SelectionDAG &DAG) const {
3137
3138 if (AMDGPU::isKernel(CallConv)) {
3139 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3140 OutVals, DL, DAG);
3141 }
3142
3143 bool IsShader = AMDGPU::isShader(CallConv);
3144
3145 Info->setIfReturnsVoid(Outs.empty());
3146 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3147
3148 // CCValAssign - represent the assignment of the return value to a location.
3151
3152 // CCState - Info about the registers and stack slots.
3153 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3154 *DAG.getContext());
3155
3156 // Analyze outgoing return values.
3157 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3158
3159 SDValue Glue;
3161 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3162
3163 // Copy the result values into the output registers.
3164 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3165 ++I, ++RealRVLocIdx) {
3166 CCValAssign &VA = RVLocs[I];
3167 assert(VA.isRegLoc() && "Can only return in registers!");
3168 // TODO: Partially return in registers if return values don't fit.
3169 SDValue Arg = OutVals[RealRVLocIdx];
3170
3171 // Copied from other backends.
3172 switch (VA.getLocInfo()) {
3173 case CCValAssign::Full:
3174 break;
3175 case CCValAssign::BCvt:
3176 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3177 break;
3178 case CCValAssign::SExt:
3179 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3180 break;
3181 case CCValAssign::ZExt:
3182 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3183 break;
3184 case CCValAssign::AExt:
3185 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3186 break;
3187 default:
3188 llvm_unreachable("Unknown loc info!");
3189 }
3190
3191 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3192 Glue = Chain.getValue(1);
3193 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3194 }
3195
3196 // FIXME: Does sret work properly?
3197 if (!Info->isEntryFunction()) {
3198 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3199 const MCPhysReg *I =
3200 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3201 if (I) {
3202 for (; *I; ++I) {
3203 if (AMDGPU::SReg_64RegClass.contains(*I))
3204 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3205 else if (AMDGPU::SReg_32RegClass.contains(*I))
3206 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3207 else
3208 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3209 }
3210 }
3211 }
3212
3213 // Update chain and glue.
3214 RetOps[0] = Chain;
3215 if (Glue.getNode())
3216 RetOps.push_back(Glue);
3217
3218 unsigned Opc = AMDGPUISD::ENDPGM;
3219 if (!IsWaveEnd)
3221 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3222}
3223
3225 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3226 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3227 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3228 SDValue ThisVal) const {
3229 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3230
3231 // Assign locations to each value returned by this call.
3233 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3234 *DAG.getContext());
3235 CCInfo.AnalyzeCallResult(Ins, RetCC);
3236
3237 // Copy all of the result registers out of their specified physreg.
3238 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3239 CCValAssign VA = RVLocs[i];
3240 SDValue Val;
3241
3242 if (VA.isRegLoc()) {
3243 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3244 Chain = Val.getValue(1);
3245 InGlue = Val.getValue(2);
3246 } else if (VA.isMemLoc()) {
3247 report_fatal_error("TODO: return values in memory");
3248 } else
3249 llvm_unreachable("unknown argument location type");
3250
3251 switch (VA.getLocInfo()) {
3252 case CCValAssign::Full:
3253 break;
3254 case CCValAssign::BCvt:
3255 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3256 break;
3257 case CCValAssign::ZExt:
3258 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3259 DAG.getValueType(VA.getValVT()));
3260 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3261 break;
3262 case CCValAssign::SExt:
3263 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3264 DAG.getValueType(VA.getValVT()));
3265 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3266 break;
3267 case CCValAssign::AExt:
3268 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3269 break;
3270 default:
3271 llvm_unreachable("Unknown loc info!");
3272 }
3273
3274 InVals.push_back(Val);
3275 }
3276
3277 return Chain;
3278}
3279
3280// Add code to pass special inputs required depending on used features separate
3281// from the explicit user arguments present in the IR.
3283 CallLoweringInfo &CLI,
3284 CCState &CCInfo,
3285 const SIMachineFunctionInfo &Info,
3286 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3287 SmallVectorImpl<SDValue> &MemOpChains,
3288 SDValue Chain) const {
3289 // If we don't have a call site, this was a call inserted by
3290 // legalization. These can never use special inputs.
3291 if (!CLI.CB)
3292 return;
3293
3294 SelectionDAG &DAG = CLI.DAG;
3295 const SDLoc &DL = CLI.DL;
3296 const Function &F = DAG.getMachineFunction().getFunction();
3297
3298 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3299 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3300
3301 const AMDGPUFunctionArgInfo *CalleeArgInfo
3303 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3304 auto &ArgUsageInfo =
3306 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3307 }
3308
3309 // TODO: Unify with private memory register handling. This is complicated by
3310 // the fact that at least in kernels, the input argument is not necessarily
3311 // in the same location as the input.
3312 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3314 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3315 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3316 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3317 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3318 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3319 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3320 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3321 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3322 };
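  // For example, a call site carrying "amdgpu-no-dispatch-ptr" proves the
  // callee never reads DISPATCH_PTR, so the loop below skips that entry and
  // no SGPR copy or stack store is emitted for it.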
3323
3324 for (auto Attr : ImplicitAttrs) {
3325 const ArgDescriptor *OutgoingArg;
3326 const TargetRegisterClass *ArgRC;
3327 LLT ArgTy;
3328
3329 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3330
3331 // If the callee does not use the attribute value, skip copying the value.
3332 if (CLI.CB->hasFnAttr(Attr.second))
3333 continue;
3334
3335 std::tie(OutgoingArg, ArgRC, ArgTy) =
3336 CalleeArgInfo->getPreloadedValue(InputID);
3337 if (!OutgoingArg)
3338 continue;
3339
3340 const ArgDescriptor *IncomingArg;
3341 const TargetRegisterClass *IncomingArgRC;
3342 LLT Ty;
3343 std::tie(IncomingArg, IncomingArgRC, Ty) =
3344 CallerArgInfo.getPreloadedValue(InputID);
3345 assert(IncomingArgRC == ArgRC);
3346
3347 // All special arguments are ints for now.
3348 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3349 SDValue InputReg;
3350
3351 if (IncomingArg) {
3352 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3353 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3354 // The implicit arg ptr is special because it doesn't have a corresponding
3355 // input for kernels, and is computed from the kernarg segment pointer.
3356 InputReg = getImplicitArgPtr(DAG, DL);
3357 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3358 std::optional<uint32_t> Id =
3360 if (Id.has_value()) {
3361 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3362 } else {
3363 InputReg = DAG.getUNDEF(ArgVT);
3364 }
3365 } else {
 3366 // We may have proven the input wasn't needed, although the ABI still
 3367 // requires it. We just need to allocate the register appropriately.
3368 InputReg = DAG.getUNDEF(ArgVT);
3369 }
3370
3371 if (OutgoingArg->isRegister()) {
3372 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3373 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3374 report_fatal_error("failed to allocate implicit input argument");
3375 } else {
3376 unsigned SpecialArgOffset =
3377 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3378 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3379 SpecialArgOffset);
3380 MemOpChains.push_back(ArgStore);
3381 }
3382 }
3383
 3384 // Pack the workitem IDs into a single register, or pass them as-is if
 3385 // already packed.
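  // The packed layout matches the entry-point convention used above: X in
  // bits [9:0], Y in bits [19:10] and Z in bits [29:20], built with the
  // shift-by-10 and shift-by-20 ORs below.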
3386 const ArgDescriptor *OutgoingArg;
3387 const TargetRegisterClass *ArgRC;
3388 LLT Ty;
3389
3390 std::tie(OutgoingArg, ArgRC, Ty) =
3392 if (!OutgoingArg)
3393 std::tie(OutgoingArg, ArgRC, Ty) =
3395 if (!OutgoingArg)
3396 std::tie(OutgoingArg, ArgRC, Ty) =
3398 if (!OutgoingArg)
3399 return;
3400
3401 const ArgDescriptor *IncomingArgX = std::get<0>(
3403 const ArgDescriptor *IncomingArgY = std::get<0>(
3405 const ArgDescriptor *IncomingArgZ = std::get<0>(
3407
3408 SDValue InputReg;
3409 SDLoc SL;
3410
3411 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3412 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3413 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3414
3415 // If incoming ids are not packed we need to pack them.
3416 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3417 NeedWorkItemIDX) {
3418 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3419 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3420 } else {
3421 InputReg = DAG.getConstant(0, DL, MVT::i32);
3422 }
3423 }
3424
3425 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3426 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3427 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3428 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3429 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3430 InputReg = InputReg.getNode() ?
3431 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3432 }
3433
3434 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3435 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3436 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3437 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3438 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3439 InputReg = InputReg.getNode() ?
3440 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3441 }
3442
3443 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3444 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3445 // We're in a situation where the outgoing function requires the workitem
 3446 // ID, but the calling function does not have it (e.g. a graphics function
3447 // calling a C calling convention function). This is illegal, but we need
3448 // to produce something.
3449 InputReg = DAG.getUNDEF(MVT::i32);
3450 } else {
 3451 // Workitem IDs are already packed; any present incoming argument
 3452 // will carry all of the required fields.
3454 IncomingArgX ? *IncomingArgX :
3455 IncomingArgY ? *IncomingArgY :
3456 *IncomingArgZ, ~0u);
3457 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3458 }
3459 }
3460
3461 if (OutgoingArg->isRegister()) {
3462 if (InputReg)
3463 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3464
3465 CCInfo.AllocateReg(OutgoingArg->getRegister());
3466 } else {
3467 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3468 if (InputReg) {
3469 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3470 SpecialArgOffset);
3471 MemOpChains.push_back(ArgStore);
3472 }
3473 }
3474}
3475
3477 return CC == CallingConv::Fast;
3478}
3479
3480/// Return true if we might ever do TCO for calls with this calling convention.
3482 switch (CC) {
3483 case CallingConv::C:
3485 return true;
3486 default:
3487 return canGuaranteeTCO(CC);
3488 }
3489}
3490
3492 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3494 const SmallVectorImpl<SDValue> &OutVals,
3495 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3496 if (AMDGPU::isChainCC(CalleeCC))
3497 return true;
3498
3499 if (!mayTailCallThisCC(CalleeCC))
3500 return false;
3501
3502 // For a divergent call target, we need to do a waterfall loop over the
3503 // possible callees which precludes us from using a simple jump.
3504 if (Callee->isDivergent())
3505 return false;
3506
3508 const Function &CallerF = MF.getFunction();
3509 CallingConv::ID CallerCC = CallerF.getCallingConv();
3511 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3512
3513 // Kernels aren't callable, and don't have a live in return address so it
3514 // doesn't make sense to do a tail call with entry functions.
3515 if (!CallerPreserved)
3516 return false;
3517
3518 bool CCMatch = CallerCC == CalleeCC;
3519
3521 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3522 return true;
3523 return false;
3524 }
3525
3526 // TODO: Can we handle var args?
3527 if (IsVarArg)
3528 return false;
3529
3530 for (const Argument &Arg : CallerF.args()) {
3531 if (Arg.hasByValAttr())
3532 return false;
3533 }
3534
3535 LLVMContext &Ctx = *DAG.getContext();
3536
3537 // Check that the call results are passed in the same way.
3538 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3539 CCAssignFnForCall(CalleeCC, IsVarArg),
3540 CCAssignFnForCall(CallerCC, IsVarArg)))
3541 return false;
3542
3543 // The callee has to preserve all registers the caller needs to preserve.
3544 if (!CCMatch) {
3545 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3546 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3547 return false;
3548 }
3549
3550 // Nothing more to check if the callee is taking no arguments.
3551 if (Outs.empty())
3552 return true;
3553
3555 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3556
3557 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3558
3559 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3560 // If the stack arguments for this call do not fit into our own save area then
3561 // the call cannot be made tail.
3562 // TODO: Is this really necessary?
3563 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3564 return false;
3565
3566 const MachineRegisterInfo &MRI = MF.getRegInfo();
3567 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3568}
3569
3571 if (!CI->isTailCall())
3572 return false;
3573
3574 const Function *ParentFn = CI->getParent()->getParent();
3576 return false;
3577 return true;
3578}
3579
3580// The wave scratch offset register is used as the global base pointer.
3582 SmallVectorImpl<SDValue> &InVals) const {
3583 CallingConv::ID CallConv = CLI.CallConv;
3584 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3585
3586 SelectionDAG &DAG = CLI.DAG;
3587
3588 TargetLowering::ArgListEntry RequestedExec;
3589 if (IsChainCallConv) {
3590 // The last argument should be the value that we need to put in EXEC.
3591 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3592 // don't treat it like the rest of the arguments.
3593 RequestedExec = CLI.Args.back();
3594 assert(RequestedExec.Node && "No node for EXEC");
3595
3596 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3597 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3598
3599 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3600 CLI.Outs.pop_back();
3601 CLI.OutVals.pop_back();
3602
3603 if (RequestedExec.Ty->isIntegerTy(64)) {
3604 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3605 CLI.Outs.pop_back();
3606 CLI.OutVals.pop_back();
3607 }
3608
3609 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3610 "Haven't popped all the pieces of the EXEC mask");
3611 }
3612
3613 const SDLoc &DL = CLI.DL;
3615 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3617 SDValue Chain = CLI.Chain;
3618 SDValue Callee = CLI.Callee;
3619 bool &IsTailCall = CLI.IsTailCall;
3620 bool IsVarArg = CLI.IsVarArg;
3621 bool IsSibCall = false;
3623
3624 if (Callee.isUndef() || isNullConstant(Callee)) {
3625 if (!CLI.IsTailCall) {
3626 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3627 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3628 }
3629
3630 return Chain;
3631 }
3632
3633 if (IsVarArg) {
3634 return lowerUnhandledCall(CLI, InVals,
3635 "unsupported call to variadic function ");
3636 }
3637
3638 if (!CLI.CB)
3639 report_fatal_error("unsupported libcall legalization");
3640
3641 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3642 return lowerUnhandledCall(CLI, InVals,
3643 "unsupported required tail call to function ");
3644 }
3645
3646 if (IsTailCall) {
3648 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3649 if (!IsTailCall &&
3650 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3651 report_fatal_error("failed to perform tail call elimination on a call "
3652 "site marked musttail or on llvm.amdgcn.cs.chain");
3653 }
3654
3655 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3656
3657 // A sibling call is one where we're under the usual C ABI and not planning
3658 // to change that but can still do a tail call:
3659 if (!TailCallOpt && IsTailCall)
3660 IsSibCall = true;
3661
3662 if (IsTailCall)
3663 ++NumTailCalls;
3664 }
3665
3666 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3667 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3668 SmallVector<SDValue, 8> MemOpChains;
3669
3670 // Analyze operands of the call, assigning locations to each operand.
3671 SmallVector<CCValAssign, 16> ArgLocs;
3672 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3673 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3674
3675 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3676 // With a fixed ABI, allocate fixed registers before user arguments.
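// These are the implicit inputs (dispatch/queue pointers, workgroup and
// workitem IDs, and similar) that callees expect in dedicated registers.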
3677 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3678 }
3679
3680 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3681
3682 // Get a count of how many bytes are to be pushed on the stack.
3683 unsigned NumBytes = CCInfo.getStackSize();
3684
3685 if (IsSibCall) {
3686 // Since we're not changing the ABI to make this a tail call, the memory
3687 // operands are already available in the caller's incoming argument space.
3688 NumBytes = 0;
3689 }
3690
3691 // FPDiff is the byte offset of the call's argument area from the callee's.
3692 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3693 // by this amount for a tail call. In a sibling call it must be 0 because the
3694 // caller will deallocate the entire stack and the callee still expects its
3695 // arguments to begin at SP+0. Completely unused for non-tail calls.
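// Note that because required (GuaranteedTailCallOpt) tail calls were rejected
// above, FPDiff stays 0 in this lowering.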
3696 int32_t FPDiff = 0;
3697 MachineFrameInfo &MFI = MF.getFrameInfo();
3698
3699 // Adjust the stack pointer for the new arguments...
3700 // These operations are automatically eliminated by the prolog/epilog pass
3701 if (!IsSibCall)
3702 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3703
3704 if (!IsSibCall || IsChainCallConv) {
3705 if (!Subtarget->enableFlatScratch()) {
3706 SmallVector<SDValue, 4> CopyFromChains;
3707
3708 // In the HSA case, this should be an identity copy.
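// Regular callees take the scratch resource descriptor in s[0:3], while
// amdgpu_cs_chain callees take it in s[48:51].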
3709 SDValue ScratchRSrcReg
3710 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3711 RegsToPass.emplace_back(IsChainCallConv
3712 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3713 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3714 ScratchRSrcReg);
3715 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3716 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3717 }
3718 }
3719
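// Private (scratch) pointers are 32-bit, so stack argument addresses are
// computed as i32 values.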
3720 MVT PtrVT = MVT::i32;
3721
3722 // Walk the register/memloc assignments, inserting copies/loads.
3723 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3724 CCValAssign &VA = ArgLocs[i];
3725 SDValue Arg = OutVals[i];
3726
3727 // Promote the value if needed.
3728 switch (VA.getLocInfo()) {
3729 case CCValAssign::Full:
3730 break;
3731 case CCValAssign::BCvt:
3732 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3733 break;
3734 case CCValAssign::ZExt:
3735 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3736 break;
3737 case CCValAssign::SExt:
3738 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3739 break;
3740 case CCValAssign::AExt:
3741 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3742 break;
3743 case CCValAssign::FPExt:
3744 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3745 break;
3746 default:
3747 llvm_unreachable("Unknown loc info!");
3748 }
3749
3750 if (VA.isRegLoc()) {
3751 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3752 } else {
3753 assert(VA.isMemLoc());
3754
3755 SDValue DstAddr;
3756 MachinePointerInfo DstInfo;
3757
3758 unsigned LocMemOffset = VA.getLocMemOffset();
3759 int32_t Offset = LocMemOffset;
3760
3761 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3762 MaybeAlign Alignment;
3763
3764 if (IsTailCall) {
3765 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3766 unsigned OpSize = Flags.isByVal() ?
3767 Flags.getByValSize() : VA.getValVT().getStoreSize();
3768
3769 // FIXME: We can have better than the minimum byval required alignment.
3770 Alignment =
3771 Flags.isByVal()
3772 ? Flags.getNonZeroByValAlign()
3773 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3774
3775 Offset = Offset + FPDiff;
3776 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3777
3778 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3779 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3780
3781 // Make sure any stack arguments overlapping with where we're storing
3782 // are loaded before this eventual operation. Otherwise they'll be
3783 // clobbered.
3784
3785 // FIXME: Why is this really necessary? This seems to just result in a
3786 // lot of code to copy the stack arguments and write them back to the same
3787 // locations, which are supposed to be immutable?
3788 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3789 } else {
3790 // Stores to the argument stack area are relative to the stack pointer.
3791 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3792 MVT::i32);
3793 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3794 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3795 Alignment =
3796 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3797 }
3798
3799 if (Outs[i].Flags.isByVal()) {
3800 SDValue SizeNode =
3801 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3802 SDValue Cpy =
3803 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3804 Outs[i].Flags.getNonZeroByValAlign(),
3805 /*isVol = */ false, /*AlwaysInline = */ true,
3806 /*isTailCall = */ false, DstInfo,
3807 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3808
3809 MemOpChains.push_back(Cpy);
3810 } else {
3811 SDValue Store =
3812 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3813 MemOpChains.push_back(Store);
3814 }
3815 }
3816 }
3817
3818 if (!MemOpChains.empty())
3819 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3820
3821 // Build a sequence of copy-to-reg nodes chained together with token chain
3822 // and flag operands which copy the outgoing args into the appropriate regs.
3823 SDValue InGlue;
3824 for (auto &RegToPass : RegsToPass) {
3825 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3826 RegToPass.second, InGlue);
3827 InGlue = Chain.getValue(1);
3828 }
3829
3830
3831 // We don't usually want to end the call-sequence here because we would tidy
3832 // the frame up *after* the call, however in the ABI-changing tail-call case
3833 // we've carefully laid out the parameters so that when sp is reset they'll be
3834 // in the correct location.
3835 if (IsTailCall && !IsSibCall) {
3836 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3837 InGlue = Chain.getValue(1);
3838 }
3839
3840 std::vector<SDValue> Ops;
3841 Ops.push_back(Chain);
3842 Ops.push_back(Callee);
3843 // Add a redundant copy of the callee global which will not be legalized, as
3844 // we need direct access to the callee later.
3845 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3846 const GlobalValue *GV = GSD->getGlobal();
3847 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3848 } else {
3849 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3850 }
3851
3852 if (IsTailCall) {
3853 // Each tail call may have to adjust the stack by a different amount, so
3854 // this information must travel along with the operation for eventual
3855 // consumption by emitEpilogue.
3856 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3857 }
3858
3859 if (IsChainCallConv)
3860 Ops.push_back(RequestedExec.Node);
3861
3862 // Add argument registers to the end of the list so that they are known live
3863 // into the call.
3864 for (auto &RegToPass : RegsToPass) {
3865 Ops.push_back(DAG.getRegister(RegToPass.first,
3866 RegToPass.second.getValueType()));
3867 }
3868
3869 // Add a register mask operand representing the call-preserved registers.
3870 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3871 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3872 assert(Mask && "Missing call preserved mask for calling convention");
3873 Ops.push_back(DAG.getRegisterMask(Mask));
3874
3875 if (SDValue Token = CLI.ConvergenceControlToken) {
3876 SmallVector<SDValue, 2> GlueOps;
3877 GlueOps.push_back(Token);
3878 if (InGlue)
3879 GlueOps.push_back(InGlue);
3880
3881 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3882 MVT::Glue, GlueOps),
3883 0);
3884 }
3885
3886 if (InGlue)
3887 Ops.push_back(InGlue);
3888
3889 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3890
3891 // If we're doing a tail call, use a TC_RETURN here rather than an
3892 // actual call instruction.
3893 if (IsTailCall) {
3894 MFI.setHasTailCall();
3895 unsigned OPC = AMDGPUISD::TC_RETURN;
3896 switch (CallConv) {
3897 case CallingConv::AMDGPU_Gfx:
3898 OPC = AMDGPUISD::TC_RETURN_GFX;
3899 break;
3900 case CallingConv::AMDGPU_CS_Chain:
3901 case CallingConv::AMDGPU_CS_ChainPreserve:
3902 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3903 break;
3904 }
3905
3906 return DAG.getNode(OPC, DL, NodeTys, Ops);
3907 }
3908
3909 // Returns a chain and a flag for retval copy to use.
3910 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3911 Chain = Call.getValue(0);
3912 InGlue = Call.getValue(1);
3913
3914 uint64_t CalleePopBytes = NumBytes;
3915 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3916 if (!Ins.empty())
3917 InGlue = Chain.getValue(1);
3918
3919 // Handle result values, copying them out of physregs into vregs that we
3920 // return.
3921 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3922 InVals, /*IsThisReturn=*/false, SDValue());
3923}
3924
3925// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3926// except for applying the wave size scale to the increment amount.
3927 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3928 SDValue Op, SelectionDAG &DAG) const {
3929 const MachineFunction &MF = DAG.getMachineFunction();
3930 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3931
3932 SDLoc dl(Op);
3933 EVT VT = Op.getValueType();
3934 SDValue Tmp1 = Op;
3935 SDValue Tmp2 = Op.getValue(1);
3936 SDValue Tmp3 = Op.getOperand(2);
3937 SDValue Chain = Tmp1.getOperand(0);
3938
3939 Register SPReg = Info->getStackPtrOffsetReg();
3940
3941 // Chain the dynamic stack allocation so that it doesn't modify the stack
3942 // pointer when other instructions are using the stack.
3943 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3944
3945 SDValue Size = Tmp2.getOperand(1);
3946 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3947 Chain = SP.getValue(1);
3948 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3949 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3950 unsigned Opc =
3951 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3952 ISD::ADD : ISD::SUB;
3953
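// The stack pointer tracks scratch usage for the entire wave, so allocating
// Size bytes per lane has to bump it by Size * wavefront_size bytes; hence
// the shift by getWavefrontSizeLog2() below.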
3954 SDValue ScaledSize = DAG.getNode(
3955 ISD::SHL, dl, VT, Size,
3956 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3957
3958 Align StackAlign = TFL->getStackAlign();
3959 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3960 if (Alignment && *Alignment > StackAlign) {
3961 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3962 DAG.getConstant(-(uint64_t)Alignment->value()
3963 << Subtarget->getWavefrontSizeLog2(),
3964 dl, VT));
3965 }
3966
3967 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3968 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3969
3970 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3971}
3972
3973 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3974 SelectionDAG &DAG) const {
3975 // We only handle constant sizes here to allow non-entry block, static sized
3976 // allocas. A truly dynamic value is more difficult to support because we
3977 // don't know if the size value is uniform or not. If the size isn't uniform,
3978 // we would need to do a wave reduction to get the maximum size to know how
3979 // much to increment the uniform stack pointer.
3980 SDValue Size = Op.getOperand(1);
3981 if (isa<ConstantSDNode>(Size))
3982 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3983
3984 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3985}
3986
3987 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
3988 if (Op.getValueType() != MVT::i32)
3989 return Op; // Defer to cannot select error.
3990
3991 Register SP = getStackPointerRegisterToSaveRestore();
3992 SDLoc SL(Op);
3993
3994 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3995
3996 // Convert from wave uniform to swizzled vector address. This should protect
3997 // from any edge cases where the stacksave result isn't directly used with
3998 // stackrestore.
3999 SDValue VectorAddress =
4000 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4001 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4002}
4003
4004 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4005 SelectionDAG &DAG) const {
4006 SDLoc SL(Op);
4007 assert(Op.getValueType() == MVT::i32);
4008
4009 uint32_t BothRoundHwReg =
4010 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4011 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4012
4013 SDValue IntrinID =
4014 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4015 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4016 Op.getOperand(0), IntrinID, GetRoundBothImm);
4017
4018 // There are two rounding modes, one for f32 and one for f64/f16. We only
4019 // report in the standard value range if both are the same.
4020 //
4021 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4022 // ties away from zero is not supported, and the other values are rotated by
4023 // 1.
4024 //
4025 // If the two rounding modes are not the same, report a target defined value.
4026
4027 // Mode register rounding mode fields:
4028 //
4029 // [1:0] Single-precision round mode.
4030 // [3:2] Double/Half-precision round mode.
4031 //
4032 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4033 //
4034 // Rounding mode   Hardware   Spec
4035 // Toward-0            3        0
4036 // Nearest Even        0        1
4037 // +Inf                1        2
4038 // -Inf                2        3
4039 // NearestAway0       N/A       4
4040 //
4041 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4042 // table we can index by the raw hardware mode.
4043 //
4044 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
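//
// For example, MODE.fp_round == 0b0000 (both fields round-to-nearest-even)
// shifts by 0 and reads the low nibble, which holds 1, the FLT_ROUNDS value
// for "to nearest". When the two fields disagree, the entry is >= 4 and gets
// mapped into the target-defined extended range further down.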
4045
4046 SDValue BitTable =
4047 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4048
4049 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4050 SDValue RoundModeTimesNumBits =
4051 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4052
4053 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4054 // knew only one mode was demanded.
4055 SDValue TableValue =
4056 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4057 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4058
4059 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4060 SDValue TableEntry =
4061 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4062
4063 // There's a gap between the 4-bit encoded table entries and the actual enum
4064 // values, so offset if it's an extended value.
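// Entries 0-3 are already valid FLT_ROUNDS values; larger entries store the
// extended enum minus 4 so they fit in four bits, which is why 4 is added back
// for the non-standard case.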
4065 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4066 SDValue IsStandardValue =
4067 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4068 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4069 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4070 TableEntry, EnumOffset);
4071
4072 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4073}
4074
4076 SelectionDAG &DAG) const {
4077 SDLoc SL(Op);
4078
4079 SDValue NewMode = Op.getOperand(1);
4080 assert(NewMode.getValueType() == MVT::i32);
4081
4082 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4083 // hardware MODE.fp_round values.
4084 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4085 uint32_t ClampedVal = std::min(
4086 static_cast<uint32_t>(ConstMode->getZExtValue()),
4087 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4088 NewMode = DAG.getConstant(
4089 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4090 } else {
4091 // If we know the input can only be one of the supported standard modes in
4092 // the range 0-3, we can use a simplified mapping to hardware values.
4093 KnownBits KB = DAG.computeKnownBits(NewMode);
4094 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
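// At least 30 known leading zero bits proves the value is in the range 0-3,
// i.e. one of the standard FLT_ROUNDS modes.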
4095 // The supported standard values are 0-3. The extended values start at 8. We
4096 // need to offset by 4 if the value is in the extended range.
4097
4098 if (UseReducedTable) {
4099 // Only the low 16 bits of the table (the four standard entries) are needed,
4099 // so a 32-bit constant suffices.
4100 SDValue BitTable = DAG.getConstant(
4101 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4102
4103 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4104 SDValue RoundModeTimesNumBits =
4105 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4106
4107 NewMode =
4108 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4109
4110 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4111 // the table extracted bits into inline immediates.
4112 } else {
4113 // table_index = umin(value, value - 4)
4114 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
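//
// The umin packs both input ranges into contiguous table indexes: for a
// standard value (0-3) the subtraction wraps to a huge unsigned number, so the
// value itself is kept; for an extended value (8 and up) value - 4 is used, so
// every mode lands in one of the 16 nibbles of the table.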
4115 SDValue BitTable =
4116 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4117
4118 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4119 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4120 SDValue IndexVal =
4121 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4122
4123 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4124 SDValue RoundModeTimesNumBits =
4125 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4126
4127 SDValue TableValue =
4128 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4129 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4130
4131 // No need to mask out the high bits since the setreg will ignore them
4132 // anyway.
4133 NewMode = TruncTable;
4134 }
4135
4136 // Insert a readfirstlane in case the value is a VGPR. We could do this
4137 // earlier and keep more operations scalar, but that interferes with
4138 // combining the source.
4139 SDValue ReadFirstLaneID =
4140 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4141 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4142 ReadFirstLaneID, NewMode);
4143 }
4144
4145 // N.B. The setreg will later be folded into s_round_mode on supported
4146 // targets.
4147 SDValue IntrinID =
4148 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4149 uint32_t BothRoundHwReg =
4150 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4151 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4152
4153 SDValue SetReg =
4154 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4155 IntrinID, RoundBothImm, NewMode);
4156
4157 return SetReg;
4158}
4159
4160 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4161 if (Op->isDivergent())
4162 return SDValue();
4163
4164 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4165 case AMDGPUAS::FLAT_ADDRESS:
4166 case AMDGPUAS::GLOBAL_ADDRESS:
4167 case AMDGPUAS::CONSTANT_ADDRESS:
4168 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4169 break;
4170 default:
4171 return SDValue();
4172 }
4173
4174 return Op;
4175}
4176
4177// Work around DAG legality rules only based on the result type.
4178 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4179 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4180 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4181 EVT SrcVT = Src.getValueType();
4182
4183 if (SrcVT.getScalarType() != MVT::bf16)
4184 return Op;
4185
4186 SDLoc SL(Op);
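// ISD::BF16_TO_FP takes the raw bf16 bits as an integer operand, hence the
// bitcast to the equivalent integer type first.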
4187 SDValue BitCast =
4188 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4189
4190 EVT DstVT = Op.getValueType();
4191 if (IsStrict)
4192 llvm_unreachable("Need STRICT_BF16_TO_FP");
4193
4194 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4195}
4196
4197 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4198 SDLoc SL(Op);
4199 if (Op.getValueType() != MVT::i64)
4200 return Op;
4201
4202 uint32_t ModeHwReg =
4204 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4205 uint32_t TrapHwReg =
4207 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4208
4209 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4210 SDValue IntrinID =
4211 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4212 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4213 Op.getOperand(0), IntrinID, ModeHwRegImm);
4214 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4215 Op.getOperand(0), IntrinID, TrapHwRegImm);
4216 SDValue TokenReg =
4217 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4218 GetTrapReg.getValue(1));
4219
4220 SDValue CvtPtr =
4221 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4222 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4223
4224 return DAG.getMergeValues({Result, TokenReg}, SL);
4225}
4226
4227 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4228 SDLoc SL(Op);
4229 if (Op.getOperand(1).getValueType() != MVT::i64)
4230 return Op;
4231
4232 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4233 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4234 DAG.getConstant(0, SL, MVT::i32));
4235 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4236 DAG.getConstant(1, SL, MVT::i32));
4237
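// s_setreg needs wave-uniform (SGPR) operands, so route both halves through
// readfirstlane in case they were computed in VGPRs.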
4238 SDValue ReadFirstLaneID =
4239 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4240 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4241 ReadFirstLaneID, NewModeReg);
4242 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4243 ReadFirstLaneID, NewTrapReg);
4244
4245 unsigned ModeHwReg =
4247 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4248 unsigned TrapHwReg =
4250 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4251
4252 SDValue IntrinID =
4253 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4254 SDValue SetModeReg =
4255 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4256 IntrinID, ModeHwRegImm, NewModeReg);
4257 SDValue SetTrapReg =
4258 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4259 IntrinID, TrapHwRegImm, NewTrapReg);
4260 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4261}
4262
4263 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4264 const MachineFunction &MF) const {
4265 Register Reg = StringSwitch<Register>(RegName)
4266 .Case("m0", AMDGPU::M0)
4267 .Case("exec", AMDGPU::EXEC)
4268 .Case("exec_lo", AMDGPU::EXEC_LO)
4269 .Case("exec_hi", AMDGPU::EXEC_HI)
4270 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4271 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4272 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4273 .Default(Register());
4274
4275 if (Reg == AMDGPU::NoRegister) {
4276 report_fatal_error(Twine("invalid register name \""
4277 + StringRef(RegName) + "\"."));
4278
4279 }
4280
4281 if (!Subtarget->hasFlatScrRegister() &&
4282 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4283 report_fatal_error(Twine("invalid register \""
4284 + StringRef(RegName) + "\" for subtarget."));
4285 }
4286
4287 switch (Reg) {
4288 case AMDGPU::M0:
4289 case AMDGPU::EXEC_LO:
4290 case AMDGPU::EXEC_HI:
4291 case AMDGPU::FLAT_SCR_LO:
4292 case AMDGPU::FLAT_SCR_HI:
4293 if (VT.getSizeInBits() == 32)
4294 return Reg;
4295 break;
4296 case AMDGPU::EXEC:
4297 case AMDGPU::FLAT_SCR:
4298 if (VT.getSizeInBits() == 64)
4299 return Reg;
4300 break;
4301 default:
4302 llvm_unreachable("missing register type checking");
4303 }
4304
4305 report_fatal_error(Twine("invalid type for register \""
4306 + StringRef(RegName) + "\"."));
4307}
4308
4309// If kill is not the last instruction, split the block so kill is always a
4310// proper terminator.
4311 MachineBasicBlock *
4312 SITargetLowering::splitKillBlock(MachineInstr &MI,
4313 MachineBasicBlock *BB) const {
4314 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4315 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4316 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4317 return SplitBB;
4318}
4319
4320// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4321// \p MI will be the only instruction in the loop body block. Otherwise, it will
4322// be the first instruction in the remainder block.
4323//
4324/// \returns { LoopBody, Remainder }
4325static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4326 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB,
4327 bool InstInLoop) {
4328