1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
188 setBooleanContents(ZeroOrOneBooleanContent);
189 setBooleanVectorContents(ZeroOrOneBooleanContent);
190
191 // We need to custom lower vector stores from local memory
192 setOperationAction(ISD::LOAD,
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
199 setOperationAction(ISD::STORE,
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 if (Subtarget->hasPkMovB32()) {
426 // TODO: 16-bit element vectors should be legal with even aligned elements.
427 // TODO: Can be legal with wider source types than the result with
428 // subregister extracts.
429 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
430 }
431
432 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
433 Custom);
434
435 // Avoid stack access for these.
436 // TODO: Generalize to more vector types.
438 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
439 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
440 Custom);
441
442 // Deal with vec3 vector operations when widened to vec4.
444 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
445
446 // Deal with vec5/6/7 vector operations when widened to vec8.
448 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
449 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
450 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
451 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
452 Custom);
453
454 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
455 // and output demarshalling
456 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
457
458 // We can't return success/failure, only the old value,
459 // let LLVM add the comparison
461 Expand);
462
463 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
464
465 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
466
467 // FIXME: This should be narrowed to i32, but that only happens if i64 is
468 // illegal.
469 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
470 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
471
472 // On SI this is s_memtime and s_memrealtime on VI.
474
475 if (Subtarget->hasSMemRealTime() ||
479
480 if (Subtarget->has16BitInsts()) {
483 } else {
485 }
486
487 if (Subtarget->hasMadMacF32Insts())
489
490 if (!Subtarget->hasBFI())
491 // fcopysign can be done in a single instruction with BFI.
492 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
493
494 if (!Subtarget->hasBCNT(32))
496
497 if (!Subtarget->hasBCNT(64))
499
500 if (Subtarget->hasFFBH())
502
503 if (Subtarget->hasFFBL())
505
506 // We only really have 32-bit BFE instructions (and 16-bit on VI).
507 //
508 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
509 // effort to match them now. We want this to be false for i64 cases when the
510 // extraction isn't restricted to the upper or lower half. Ideally we would
511 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
512 // span the midpoint are probably relatively rare, so don't worry about them
513 // for now.
514 if (Subtarget->hasBFE())
516
517 // Clamp modifier on add/sub
518 if (Subtarget->hasIntClamp())
520
521 if (Subtarget->hasAddNoCarry())
522 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
523 Legal);
524
525 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
526 Custom);
527
528 // These are really only legal for ieee_mode functions. We should be avoiding
529 // them for functions that don't have ieee_mode enabled, so just say they are
530 // legal.
532 {MVT::f32, MVT::f64}, Legal);
533
534 if (Subtarget->haveRoundOpsF64())
536 Legal);
537 else
539 MVT::f64, Custom);
540
542 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
543 Legal);
544 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
545
548
549 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
550 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
551
552 // Custom lower these because we can't specify a rule based on an illegal
553 // source bf16.
556
557 if (Subtarget->has16BitInsts()) {
560 MVT::i16, Legal);
561
562 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
563
565 MVT::i16, Expand);
566
570 ISD::CTPOP},
571 MVT::i16, Promote);
572
574
575 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
576
578 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
580 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
581
585
587
588 // F16 - Constant Actions.
591
592 // F16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
597
598 // BF16 - Load/Store Actions.
600 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
602 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
603
604 // F16 - VOP1 Actions.
607 MVT::f16, Custom);
608
611
612 // F16 - VOP2 Actions.
613 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
614 Expand);
618
619 // F16 - VOP3 Actions.
621 if (STI.hasMadF16())
623
624 for (MVT VT :
625 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
626 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
627 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
628 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
629 switch (Op) {
630 case ISD::LOAD:
631 case ISD::STORE:
633 case ISD::BITCAST:
634 case ISD::UNDEF:
639 case ISD::IS_FPCLASS:
640 break;
644 break;
645 default:
647 break;
648 }
649 }
650 }
651
652 // v_perm_b32 can handle either of these.
653 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
655
656 // XXX - Do these do anything? Vector constants turn into build_vector.
657 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
658
659 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
660 Legal);
661
663 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
665 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
666
668 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
670 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
671
672 setOperationAction(ISD::AND, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::OR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
676 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
677 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
678
680 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
683 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
684 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
689 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
691 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
692
694 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
696 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
697 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
699
701 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
703 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
704
706 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
708 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
710 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
711
712 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
718
720 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
722 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
723 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
724 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
725
726 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
732
734 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
736 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
737 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
739
741 MVT::v2i32, Expand);
743
745 MVT::v4i32, Expand);
746
748 MVT::v8i32, Expand);
749
750 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
751 Subtarget->hasVOP3PInsts() ? Legal : Custom);
752
753 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
754 // This isn't really legal, but this avoids the legalizer unrolling it (and
755 // allows matching fneg (fabs x) patterns)
756 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
757
760
763 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
764 Custom);
765
767 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
768 Expand);
769
770 for (MVT Vec16 :
771 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
772 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
775 Vec16, Custom);
777 }
778 }
779
780 if (Subtarget->hasVOP3PInsts()) {
784 MVT::v2i16, Legal);
785
788 MVT::v2f16, Legal);
789
791 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
792
794 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
795 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
796 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
797 Custom);
798
799 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
800 // Split vector operations.
805 VT, Custom);
806
807 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
808 // Split vector operations.
810 VT, Custom);
811
812 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
813 Custom);
814
815 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
816 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
817 Custom);
818
819 if (Subtarget->hasPackedFP32Ops()) {
821 MVT::v2f32, Legal);
823 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
824 Custom);
825 }
826 }
827
829
830 if (Subtarget->has16BitInsts()) {
832 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
834 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
835 } else {
836 // Legalization hack.
837 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
838
840 }
841
843 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
844 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
845 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
846 MVT::v32f16, MVT::v32bf16},
847 Custom);
848
850
851 if (Subtarget->hasScalarSMulU64())
853
854 if (Subtarget->hasMad64_32())
856
857 if (Subtarget->hasPrefetch())
859
860 if (Subtarget->hasIEEEMinMax()) {
862 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
864 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
865 Custom);
866 } else {
867 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
868 if (Subtarget->hasMinimum3Maximum3F32())
870
871 if (Subtarget->hasMinimum3Maximum3PKF16())
873 }
874
876 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
877 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
878 MVT::i8},
879 Custom);
880
882 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
883 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
884 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
885 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
886 Custom);
887
889 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
890 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
891 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
893 Custom);
894
900
901 // TODO: Could move this to custom lowering, could benefit from combines on
902 // extract of relevant bits.
904
906
907 if (Subtarget->hasBF16ConversionInsts()) {
911 }
912
913 if (Subtarget->hasCvtPkF16F32Inst()) {
915 }
916
919 ISD::SUB,
921 ISD::MUL,
922 ISD::FADD,
923 ISD::FSUB,
924 ISD::FDIV,
925 ISD::FMUL,
932 ISD::FMA,
933 ISD::SMIN,
934 ISD::SMAX,
935 ISD::UMIN,
936 ISD::UMAX,
939 ISD::SMIN,
940 ISD::SMAX,
941 ISD::UMIN,
942 ISD::UMAX,
943 ISD::AND,
944 ISD::OR,
945 ISD::XOR,
946 ISD::SHL,
947 ISD::SRL,
948 ISD::SRA,
949 ISD::FSHR,
959
960 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
962
963 // All memory operations. Some folding on the pointer operand is done to help
964 // matching the constant offsets in the addressing modes.
989
990 // FIXME: In other contexts we pretend this is a per-function property.
992
994}
995
996const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
997
998ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
999 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1000 return RCRegs;
1001}
1002
1003//===----------------------------------------------------------------------===//
1004// TargetLowering queries
1005//===----------------------------------------------------------------------===//
1006
1007// v_mad_mix* support a conversion from f16 to f32.
1008//
1009// There is only one special case, when denormals are enabled, that we don't
1010// currently handle, where this is OK to use.
1011bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1012 EVT DestVT, EVT SrcVT) const {
1013 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1014 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1015 DestVT.getScalarType() == MVT::f32 &&
1016 SrcVT.getScalarType() == MVT::f16 &&
1017 // TODO: This probably only requires no input flushing?
1018 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1019}
1020
1021bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1022 LLT DestTy, LLT SrcTy) const {
1023 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1024 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1025 DestTy.getScalarSizeInBits() == 32 &&
1026 SrcTy.getScalarSizeInBits() == 16 &&
1027 // TODO: This probably only requires no input flushing?
1028 denormalModeIsFlushAllF32(*MI.getMF());
1029}
1030
1031bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1032 // SI has some legal vector types, but no legal vector operations. Say no
1033 // shuffles are legal in order to prefer scalarizing some vector operations.
1034 return false;
1035}
1036
1037MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1038 CallingConv::ID CC,
1039 EVT VT) const {
1040 if (CC == CallingConv::AMDGPU_KERNEL)
1041 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1042
1043 if (VT.isVector()) {
1044 EVT ScalarVT = VT.getScalarType();
1045 unsigned Size = ScalarVT.getSizeInBits();
1046 if (Size == 16) {
1047 if (Subtarget->has16BitInsts()) {
1048 if (VT.isInteger())
1049 return MVT::v2i16;
1050 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1051 }
1052 return VT.isInteger() ? MVT::i32 : MVT::f32;
1053 }
1054
1055 if (Size < 16)
1056 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1057 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1058 }
1059
1060 if (VT.getSizeInBits() > 32)
1061 return MVT::i32;
1062
1063 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1064}
1065
1066unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1067 CallingConv::ID CC,
1068 EVT VT) const {
1069 if (CC == CallingConv::AMDGPU_KERNEL)
1070 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1071
1072 if (VT.isVector()) {
1073 unsigned NumElts = VT.getVectorNumElements();
1074 EVT ScalarVT = VT.getScalarType();
1075 unsigned Size = ScalarVT.getSizeInBits();
1076
1077 // FIXME: Should probably promote 8-bit vectors to i16.
1078 if (Size == 16 && Subtarget->has16BitInsts())
1079 return (NumElts + 1) / 2;
1080
1081 if (Size <= 32)
1082 return NumElts;
1083
1084 if (Size > 32)
1085 return NumElts * ((Size + 31) / 32);
1086 } else if (VT.getSizeInBits() > 32)
1087 return (VT.getSizeInBits() + 31) / 32;
1088
1089 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1090}
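
// Illustrative sketch, not part of the upstream source: how the two
// calling-convention queries above combine for a hypothetical v3f16 argument
// under a non-kernel calling convention on a subtarget with 16-bit
// instructions. The results follow the Size == 16 path, which packs elements
// in pairs.
//
//   // EVT ArgVT = MVT::v3f16;
//   // getRegisterTypeForCallingConv(Ctx, CC, ArgVT)  -> MVT::v2f16
//   // getNumRegistersForCallingConv(Ctx, CC, ArgVT)  -> (3 + 1) / 2 == 2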
1091
1092unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1093 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1094 unsigned &NumIntermediates, MVT &RegisterVT) const {
1095 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1096 unsigned NumElts = VT.getVectorNumElements();
1097 EVT ScalarVT = VT.getScalarType();
1098 unsigned Size = ScalarVT.getSizeInBits();
1099 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1100 // support, but unless we can properly handle 3-vectors, it will still be
1101 // inconsistent.
1102 if (Size == 16 && Subtarget->has16BitInsts()) {
1103 if (ScalarVT == MVT::bf16) {
1104 RegisterVT = MVT::i32;
1105 IntermediateVT = MVT::v2bf16;
1106 } else {
1107 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1108 IntermediateVT = RegisterVT;
1109 }
1110 NumIntermediates = (NumElts + 1) / 2;
1111 return NumIntermediates;
1112 }
1113
1114 if (Size == 32) {
1115 RegisterVT = ScalarVT.getSimpleVT();
1116 IntermediateVT = RegisterVT;
1117 NumIntermediates = NumElts;
1118 return NumIntermediates;
1119 }
1120
1121 if (Size < 16 && Subtarget->has16BitInsts()) {
1122 // FIXME: Should probably form v2i16 pieces
1123 RegisterVT = MVT::i16;
1124 IntermediateVT = ScalarVT;
1125 NumIntermediates = NumElts;
1126 return NumIntermediates;
1127 }
1128
1129 if (Size != 16 && Size <= 32) {
1130 RegisterVT = MVT::i32;
1131 IntermediateVT = ScalarVT;
1132 NumIntermediates = NumElts;
1133 return NumIntermediates;
1134 }
1135
1136 if (Size > 32) {
1137 RegisterVT = MVT::i32;
1138 IntermediateVT = RegisterVT;
1139 NumIntermediates = NumElts * ((Size + 31) / 32);
1140 return NumIntermediates;
1141 }
1142 }
1143
1144 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1145 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1146}
1147
1148static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1149 const DataLayout &DL, Type *Ty,
1150 unsigned MaxNumLanes) {
1151 assert(MaxNumLanes != 0);
1152
1153 LLVMContext &Ctx = Ty->getContext();
1154 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1155 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1156 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1157 NumElts);
1158 }
1159
1160 return TLI.getValueType(DL, Ty);
1161}
1162
1163// Peek through TFE struct returns to only use the data size.
1164static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1165 const DataLayout &DL, Type *Ty,
1166 unsigned MaxNumLanes) {
1167 auto *ST = dyn_cast<StructType>(Ty);
1168 if (!ST)
1169 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1170
1171 // TFE intrinsics return an aggregate type.
1172 assert(ST->getNumContainedTypes() == 2 &&
1173 ST->getContainedType(1)->isIntegerTy(32));
1174 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1175}
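
// Illustrative sketch, not part of the upstream source: a TFE image load
// returns an aggregate { <4 x float>, i32 } in the IR. Only the first member
// determines the memory VT, and MaxNumLanes (derived from the dmask by the
// caller) trims the lane count further.
//
//   // Type *Ret = StructType::get(FixedVectorType::get(F32Ty, 4), I32Ty);
//   // memVTFromLoadIntrReturn(TLI, DL, Ret, /*MaxNumLanes=*/2) -> v2f32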
1176
1177/// Map address space 7 to MVT::v5i32 because that's its in-memory
1178/// representation. This return value is vector-typed because there is no
1179/// MVT::i160 and it is not clear if one can be added. While this could
1180/// cause issues during codegen, these address space 7 pointers will be
1181/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1182/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1183/// modeling, to work.
1184MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1185 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1186 return MVT::v5i32;
1187 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1188 DL.getPointerSizeInBits(AS) == 192)
1189 return MVT::v6i32;
1190 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1191}
1192/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1193/// v8i32 when padding is added.
1194/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1195/// also v8i32 with padding.
1196MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1197 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1198 DL.getPointerSizeInBits(AS) == 160) ||
1199 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1200 DL.getPointerSizeInBits(AS) == 192))
1201 return MVT::v8i32;
1202 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1203}
1204
1205bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1206 const CallInst &CI,
1207 MachineFunction &MF,
1208 unsigned IntrID) const {
1210 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1212 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1214 Info.flags |= getTargetMMOFlags(CI);
1215
1216 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1218 AttributeList Attr =
1220 MemoryEffects ME = Attr.getMemoryEffects();
1221 if (ME.doesNotAccessMemory())
1222 return false;
1223
1224 // TODO: Should images get their own address space?
1225 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1226
1227 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1228 if (RsrcIntr->IsImage) {
1231 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1232 Info.align.reset();
1233 }
1234
1235 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1236 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1237 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1238 // We conservatively set the memory operand of a buffer intrinsic to the
1239 // base resource pointer, so that we can access alias information about
1240 // those pointers. Cases like "this points at the same value
1241 // but with a different offset" are handled in
1242 // areMemAccessesTriviallyDisjoint.
1243 Info.ptrVal = RsrcArg;
1244 }
1245
1246 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1247 if (!IsSPrefetch) {
1248 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1249 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1251 }
1252
1254 if (ME.onlyReadsMemory()) {
1255 if (RsrcIntr->IsImage) {
1256 unsigned MaxNumLanes = 4;
1257
1258 if (!BaseOpcode->Gather4) {
1259 // If this isn't a gather, we may have excess loaded elements in the
1260 // IR type. Check the dmask for the real number of elements loaded.
1261 unsigned DMask =
1262 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1263 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1264 }
1265
1266 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1267 CI.getType(), MaxNumLanes);
1268 } else {
1269 Info.memVT =
1271 std::numeric_limits<unsigned>::max());
1272 }
1273
1274 // FIXME: What does alignment mean for an image?
1277 } else if (ME.onlyWritesMemory()) {
1279
1280 Type *DataTy = CI.getArgOperand(0)->getType();
1281 if (RsrcIntr->IsImage) {
1282 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1283 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1284 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1285 DMaskLanes);
1286 } else
1287 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1288
1290 } else {
1291 // Atomic, NoReturn Sampler or prefetch
1294 Info.flags |=
1296
1297 if (!IsSPrefetch)
1299
1300 switch (IntrID) {
1301 default:
1302 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1303 // Fake memory access type for no return sampler intrinsics
1304 Info.memVT = MVT::i32;
1305 } else {
1306 // XXX - Should this be volatile without known ordering?
1308 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1309 }
1310 break;
1311 case Intrinsic::amdgcn_raw_buffer_load_lds:
1312 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1313 case Intrinsic::amdgcn_struct_buffer_load_lds:
1314 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1315 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1316 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1317 Info.ptrVal = CI.getArgOperand(1);
1318 return true;
1319 }
1320 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1321 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1322 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1323 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1324 Info.memVT =
1326 std::numeric_limits<unsigned>::max());
1327 Info.flags &= ~MachineMemOperand::MOStore;
1328 return true;
1329 }
1330 }
1331 }
1332 return true;
1333 }
1334
1335 switch (IntrID) {
1336 case Intrinsic::amdgcn_ds_ordered_add:
1337 case Intrinsic::amdgcn_ds_ordered_swap: {
1339 Info.memVT = MVT::getVT(CI.getType());
1340 Info.ptrVal = CI.getOperand(0);
1341 Info.align.reset();
1343
1344 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1345 if (!Vol->isZero())
1347
1348 return true;
1349 }
1350 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1351 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1353 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1354 Info.ptrVal = nullptr;
1355 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1357 return true;
1358 }
1359 case Intrinsic::amdgcn_ds_append:
1360 case Intrinsic::amdgcn_ds_consume: {
1362 Info.memVT = MVT::getVT(CI.getType());
1363 Info.ptrVal = CI.getOperand(0);
1364 Info.align.reset();
1366
1367 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1368 if (!Vol->isZero())
1370
1371 return true;
1372 }
1373 case Intrinsic::amdgcn_global_atomic_csub: {
1375 Info.memVT = MVT::getVT(CI.getType());
1376 Info.ptrVal = CI.getOperand(0);
1377 Info.align.reset();
1380 return true;
1381 }
1382 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1384 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1385
1386 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1387 Info.align.reset();
1388 Info.flags |=
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_atomic_fmin_num:
1393 case Intrinsic::amdgcn_global_atomic_fmax_num:
1394 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1395 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1396 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1397 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1399 Info.memVT = MVT::getVT(CI.getType());
1400 Info.ptrVal = CI.getOperand(0);
1401 Info.align.reset();
1405 return true;
1406 }
1407 case Intrinsic::amdgcn_global_load_tr_b64:
1408 case Intrinsic::amdgcn_global_load_tr_b128:
1409 case Intrinsic::amdgcn_ds_read_tr4_b64:
1410 case Intrinsic::amdgcn_ds_read_tr6_b96:
1411 case Intrinsic::amdgcn_ds_read_tr8_b64:
1412 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1414 Info.memVT = MVT::getVT(CI.getType());
1415 Info.ptrVal = CI.getOperand(0);
1416 Info.align.reset();
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_ds_gws_init:
1421 case Intrinsic::amdgcn_ds_gws_barrier:
1422 case Intrinsic::amdgcn_ds_gws_sema_v:
1423 case Intrinsic::amdgcn_ds_gws_sema_br:
1424 case Intrinsic::amdgcn_ds_gws_sema_p:
1425 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1427
1428 const GCNTargetMachine &TM =
1429 static_cast<const GCNTargetMachine &>(getTargetMachine());
1430
1432 Info.ptrVal = MFI->getGWSPSV(TM);
1433
1434 // This is an abstract access, but we need to specify a type and size.
1435 Info.memVT = MVT::i32;
1436 Info.size = 4;
1437 Info.align = Align(4);
1438
1439 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1441 else
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_global_load_lds: {
1447 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1448 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1449 Info.ptrVal = CI.getArgOperand(1);
1451 return true;
1452 }
1453 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1455
1456 const GCNTargetMachine &TM =
1457 static_cast<const GCNTargetMachine &>(getTargetMachine());
1458
1460 Info.ptrVal = MFI->getGWSPSV(TM);
1461
1462 // This is an abstract access, but we need to specify a type and size.
1463 Info.memVT = MVT::i32;
1464 Info.size = 4;
1465 Info.align = Align(4);
1466
1468 return true;
1469 }
1470 case Intrinsic::amdgcn_s_prefetch_data: {
1472 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1473 Info.ptrVal = CI.getArgOperand(0);
1475 return true;
1476 }
1477 default:
1478 return false;
1479 }
1480}
1481
1482void SITargetLowering::CollectTargetIntrinsicOperands(
1483 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1484 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1485 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1486 // The DAG's ValueType loses the addrspaces.
1487 // Add them as 2 extra Constant operands "from" and "to".
1488 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1489 unsigned DstAS = I.getType()->getPointerAddressSpace();
1490 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1491 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1492 break;
1493 }
1494 default:
1495 break;
1496 }
1497}
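
// Illustrative sketch, not part of the upstream source: for a call such as
//   %p = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %l)
// the hook above appends two i32 target constants, SrcAS = 3 and DstAS = 0,
// so the source and destination address spaces survive into the DAG node.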
1498
1499bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1500 SmallVectorImpl<Value *> &Ops,
1501 Type *&AccessTy) const {
1502 Value *Ptr = nullptr;
1503 switch (II->getIntrinsicID()) {
1504 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1505 case Intrinsic::amdgcn_ds_append:
1506 case Intrinsic::amdgcn_ds_consume:
1507 case Intrinsic::amdgcn_ds_read_tr4_b64:
1508 case Intrinsic::amdgcn_ds_read_tr6_b96:
1509 case Intrinsic::amdgcn_ds_read_tr8_b64:
1510 case Intrinsic::amdgcn_ds_read_tr16_b64:
1511 case Intrinsic::amdgcn_ds_ordered_add:
1512 case Intrinsic::amdgcn_ds_ordered_swap:
1513 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1514 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1515 case Intrinsic::amdgcn_global_atomic_csub:
1516 case Intrinsic::amdgcn_global_atomic_fmax_num:
1517 case Intrinsic::amdgcn_global_atomic_fmin_num:
1518 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1519 case Intrinsic::amdgcn_global_load_tr_b64:
1520 case Intrinsic::amdgcn_global_load_tr_b128:
1521 Ptr = II->getArgOperand(0);
1522 break;
1523 case Intrinsic::amdgcn_global_load_lds:
1524 Ptr = II->getArgOperand(1);
1525 break;
1526 default:
1527 return false;
1528 }
1529 AccessTy = II->getType();
1530 Ops.push_back(Ptr);
1531 return true;
1532}
1533
1534bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1535 unsigned AddrSpace) const {
1536 if (!Subtarget->hasFlatInstOffsets()) {
1537 // Flat instructions do not have offsets, and only have the register
1538 // address.
1539 return AM.BaseOffs == 0 && AM.Scale == 0;
1540 }
1541
1542 decltype(SIInstrFlags::FLAT) FlatVariant =
1543 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1544 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1545 : SIInstrFlags::FLAT;
1546
1547 return AM.Scale == 0 &&
1548 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1549 AM.BaseOffs, AddrSpace, FlatVariant));
1550}
1551
1552bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1553 if (Subtarget->hasFlatGlobalInsts())
1554 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1555
1556 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1557 // Assume that we will use FLAT for all global memory accesses
1558 // on VI.
1559 // FIXME: This assumption is currently wrong. On VI we still use
1560 // MUBUF instructions for the r + i addressing mode. As currently
1561 // implemented, the MUBUF instructions only work on buffer < 4GB.
1562 // It may be possible to support > 4GB buffers with MUBUF instructions,
1563 // by setting the stride value in the resource descriptor which would
1564 // increase the size limit to (stride * 4GB). However, this is risky,
1565 // because it has never been validated.
1566 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1567 }
1568
1569 return isLegalMUBUFAddressingMode(AM);
1570}
1571
1572bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1573 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1574 // additionally can do r + r + i with addr64. 32-bit has more addressing
1575 // mode options. Depending on the resource constant, it can also do
1576 // (i64 r0) + (i32 r1) * (i14 i).
1577 //
1578 // Private arrays end up using a scratch buffer most of the time, so also
1579 // assume those use MUBUF instructions. Scratch loads / stores are currently
1580 // implemented as mubuf instructions with offen bit set, so slightly
1581 // different than the normal addr64.
1582 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1583 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1584 return false;
1585
1586 // FIXME: Since we can split immediate into soffset and immediate offset,
1587 // would it make sense to allow any immediate?
1588
1589 switch (AM.Scale) {
1590 case 0: // r + i or just i, depending on HasBaseReg.
1591 return true;
1592 case 1:
1593 return true; // We have r + r or r + i.
1594 case 2:
1595 if (AM.HasBaseReg) {
1596 // Reject 2 * r + r.
1597 return false;
1598 }
1599
1600 // Allow 2 * r as r + r
1601 // Or 2 * r + i is allowed as r + r + i.
1602 return true;
1603 default: // Don't allow n * r
1604 return false;
1605 }
1606}
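
// Illustrative sketch, not part of the upstream source: how the MUBUF rules
// above classify a few TargetLowering::AddrMode configurations, assuming the
// immediate already passes isLegalMUBUFImmOffset.
//
//   AddrMode AM;
//   AM.HasBaseReg = true; AM.BaseOffs = 16; AM.Scale = 1;
//   // -> legal: r + r + i.
//   AM.Scale = 2;
//   // -> rejected: 2 * r + r has no encoding (2 * r alone would be fine).
//   AM.Scale = 3;
//   // -> rejected: arbitrary n * r scaling is never matched.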
1607
1608bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1609 const AddrMode &AM, Type *Ty,
1610 unsigned AS,
1611 Instruction *I) const {
1612 // No global is ever allowed as a base.
1613 if (AM.BaseGV)
1614 return false;
1615
1616 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1617 return isLegalGlobalAddressingMode(AM);
1618
1619 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1623 // If the offset isn't a multiple of 4, it probably isn't going to be
1624 // correctly aligned.
1625 // FIXME: Can we get the real alignment here?
1626 if (AM.BaseOffs % 4 != 0)
1627 return isLegalMUBUFAddressingMode(AM);
1628
1629 if (!Subtarget->hasScalarSubwordLoads()) {
1630 // There are no SMRD extloads, so if we have to do a small type access we
1631 // will use a MUBUF load.
1632 // FIXME?: We also need to do this if unaligned, but we don't know the
1633 // alignment here.
1634 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1635 return isLegalGlobalAddressingMode(AM);
1636 }
1637
1639 // SMRD instructions have an 8-bit, dword offset on SI.
1640 if (!isUInt<8>(AM.BaseOffs / 4))
1641 return false;
1642 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1643 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1644 // in 8-bits, it can use a smaller encoding.
1645 if (!isUInt<32>(AM.BaseOffs / 4))
1646 return false;
1647 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1648 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1649 if (!isUInt<20>(AM.BaseOffs))
1650 return false;
1651 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1652 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1653 // for S_BUFFER_* instructions).
1654 if (!isInt<21>(AM.BaseOffs))
1655 return false;
1656 } else {
1657 // On GFX12, all offsets are signed 24-bit in bytes.
1658 if (!isInt<24>(AM.BaseOffs))
1659 return false;
1660 }
1661
1662 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1664 AM.BaseOffs < 0) {
1665 // Scalar (non-buffer) loads can only use a negative offset if
1666 // soffset+offset is non-negative. Since the compiler can only prove that
1667 // in a few special cases, it is safer to claim that negative offsets are
1668 // not supported.
1669 return false;
1670 }
1671
1672 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1673 return true;
1674
1675 if (AM.Scale == 1 && AM.HasBaseReg)
1676 return true;
1677
1678 return false;
1679 }
1680
1681 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1682 return Subtarget->enableFlatScratch()
1684 : isLegalMUBUFAddressingMode(AM);
1685
1686 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1687 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1688 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1689 // field.
1690 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1691 // an 8-bit dword offset but we don't know the alignment here.
1692 if (!isUInt<16>(AM.BaseOffs))
1693 return false;
1694
1695 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1696 return true;
1697
1698 if (AM.Scale == 1 && AM.HasBaseReg)
1699 return true;
1700
1701 return false;
1702 }
1703
1705 // For an unknown address space, this usually means that this is for some
1706 // reason being used for pure arithmetic, and not based on some addressing
1707 // computation. We don't have instructions that compute pointers with any
1708 // addressing modes, so treat them as having no offset like flat
1709 // instructions.
1711 }
1712
1713 // Assume a user alias of global for unknown address spaces.
1714 return isLegalGlobalAddressingMode(AM);
1715}
1716
1717bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1718 const MachineFunction &MF) const {
1719 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1720 return (MemVT.getSizeInBits() <= 4 * 32);
1721 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1722 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1723 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1724 }
1725 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1726 return (MemVT.getSizeInBits() <= 2 * 32);
1727 return true;
1728}
1729
1730bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1731 unsigned Size, unsigned AddrSpace, Align Alignment,
1732 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1733 if (IsFast)
1734 *IsFast = 0;
1735
1736 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1737 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1738 // Check if alignment requirements for ds_read/write instructions are
1739 // disabled.
1740 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1741 return false;
1742
1743 Align RequiredAlignment(
1744 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1745 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1746 Alignment < RequiredAlignment)
1747 return false;
1748
1749 // Either the alignment requirements are "enabled", or there is an
1750 // unaligned LDS access related hardware bug even though alignment requirements
1751 // are "disabled". In either case, we need to check for proper alignment
1752 // requirements.
1753 //
1754 switch (Size) {
1755 case 64:
1756 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1757 // address is negative, then the instruction is incorrectly treated as
1758 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1759 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1760 // load later in the SILoadStoreOptimizer.
1761 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1762 return false;
1763
1764 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1765 // can do a 4 byte aligned, 8 byte access in a single operation using
1766 // ds_read2/write2_b32 with adjacent offsets.
1767 RequiredAlignment = Align(4);
1768
1769 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1770 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1771 // ds_write2_b32 depending on the alignment. In either case with either
1772 // alignment there is no faster way of doing this.
1773
1774 // The numbers returned here and below are not additive, it is a 'speed
1775 // rank'. They are just meant to be compared to decide if a certain way
1776 // of lowering an operation is faster than another. For that purpose
1777 // naturally aligned operation gets its bitsize to indicate that "it
1778 // operates with a speed comparable to N-bit wide load". With the full
1779 // alignment ds128 is slower than ds96 for example. If underaligned it
1780 // is comparable to a speed of a single dword access, which would then
1781 // mean 32 < 128 and it is faster to issue a wide load regardless.
1782 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
1783 // a wider load which will no longer be aligned, the latter is slower.
1784 if (IsFast)
1785 *IsFast = (Alignment >= RequiredAlignment) ? 64
1786 : (Alignment < Align(4)) ? 32
1787 : 1;
1788 return true;
1789 }
1790
1791 break;
1792 case 96:
1793 if (!Subtarget->hasDS96AndDS128())
1794 return false;
1795
1796 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
1797 // gfx8 and older.
1798
1799 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1800 // Naturally aligned access is fastest. However, also report it is Fast
1801 // if memory is aligned less than DWORD. A narrow load or store will be
1802 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1803 // be more of them, so overall we will pay less penalty issuing a single
1804 // instruction.
1805
1806 // See comment on the values above.
1807 if (IsFast)
1808 *IsFast = (Alignment >= RequiredAlignment) ? 96
1809 : (Alignment < Align(4)) ? 32
1810 : 1;
1811 return true;
1812 }
1813
1814 break;
1815 case 128:
1816 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1817 return false;
1818
1819 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
1820 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1821 // single operation using ds_read2/write2_b64.
1822 RequiredAlignment = Align(8);
1823
1824 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1825 // Naturally aligned access is fastest. However, also report it is Fast
1826 // if memory is aligned less than DWORD. A narrow load or store will be
1827 // equally slow as a single ds_read_b128/ds_write_b128, but there
1828 // will be more of them, so overall we will pay less penalty issuing a
1829 // single instruction.
1830
1831 // See comment on the values above.
1832 if (IsFast)
1833 *IsFast = (Alignment >= RequiredAlignment) ? 128
1834 : (Alignment < Align(4)) ? 32
1835 : 1;
1836 return true;
1837 }
1838
1839 break;
1840 default:
1841 if (Size > 32)
1842 return false;
1843
1844 break;
1845 }
1846
1847 // See comment on the values above.
1848 // Note that we have a single-dword or sub-dword here, so if underaligned
1849 // it is a slowest possible access, hence returned value is 0.
1850 if (IsFast)
1851 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1852
1853 return Alignment >= RequiredAlignment ||
1854 Subtarget->hasUnalignedDSAccessEnabled();
1855 }
1856
1857 // FIXME: We have to be conservative here and assume that flat operations
1858 // will access scratch. If we had access to the IR function, then we
1859 // could determine if any private memory was used in the function.
1860 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1861 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1862 bool AlignedBy4 = Alignment >= Align(4);
1863 if (IsFast)
1864 *IsFast = AlignedBy4;
1865
1866 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1867 }
1868
1869 // So long as they are correct, wide global memory operations perform better
1870 // than multiple smaller memory ops -- even when misaligned
1871 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1872 if (IsFast)
1873 *IsFast = Size;
1874
1875 return Alignment >= Align(4) ||
1877 }
1878
1879 // Smaller than dword value must be aligned.
1880 if (Size < 32)
1881 return false;
1882
1883 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1884 // byte-address are ignored, thus forcing Dword alignment.
1885 // This applies to private, global, and constant memory.
1886 if (IsFast)
1887 *IsFast = 1;
1888
1889 return Size >= 32 && Alignment >= Align(4);
1890}
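
// Illustrative sketch, not part of the upstream source: reading the "speed
// rank" convention above for a 64-bit LDS access, assuming a subtarget with
// unaligned DS access enabled and no LDS misaligned bug.
//
//   unsigned Fast = 0;
//   // allowsMisalignedMemoryAccessesImpl(64, AMDGPUAS::LOCAL_ADDRESS, Align(4),
//   //                                    Flags, &Fast) -> true, Fast == 64
//   // allowsMisalignedMemoryAccessesImpl(64, AMDGPUAS::LOCAL_ADDRESS, Align(2),
//   //                                    Flags, &Fast) -> true, Fast == 32
//   // The ranks only order lowering choices; they are not absolute latencies.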
1891
1892bool SITargetLowering::allowsMisalignedMemoryAccesses(
1893 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1894 unsigned *IsFast) const {
1895 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1896 Alignment, Flags, IsFast);
1897}
1898
1899EVT SITargetLowering::getOptimalMemOpType(
1900 const MemOp &Op, const AttributeList &FuncAttributes) const {
1901 // FIXME: Should account for address space here.
1902
1903 // The default fallback uses the private pointer size as a guess for a type to
1904 // use. Make sure we switch these to 64-bit accesses.
1905
1906 if (Op.size() >= 16 &&
1907 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1908 return MVT::v4i32;
1909
1910 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1911 return MVT::v2i32;
1912
1913 // Use the default.
1914 return MVT::Other;
1915}
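
// Illustrative sketch, not part of the upstream source: effect of the
// memcpy/memset type selection above (parameters other than size and
// destination alignment elided).
//
//   // MemOp with size 32 and 4-byte aligned destination -> MVT::v4i32
//   // MemOp with size 8  and 4-byte aligned destination -> MVT::v2i32
//   // Anything smaller or under-aligned falls back to MVT::Other (default).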
1916
1917bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1918 const MemSDNode *MemNode = cast<MemSDNode>(N);
1919 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1920}
1921
1922bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1923 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1924 AS == AMDGPUAS::PRIVATE_ADDRESS;
1925}
1926
1927bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1928 unsigned DestAS) const {
1929 // Flat -> private/local is a simple truncate.
1930 // Flat -> global is no-op
1931 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1932 return true;
1933
1934 const GCNTargetMachine &TM =
1935 static_cast<const GCNTargetMachine &>(getTargetMachine());
1936 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1937}
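
// Illustrative sketch, not part of the upstream source: why flat -> local is
// free. A flat (generic) pointer is 64 bits and an LDS pointer is 32 bits, so
// IR such as
//   %l = addrspacecast ptr %flat to ptr addrspace(3)
// lowers to a plain truncate of the pointer value, with no conversion code.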
1938
1939TargetLoweringBase::LegalizeTypeAction
1940SITargetLowering::getPreferredVectorAction(MVT VT) const {
1941 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1942 VT.getScalarType().bitsLE(MVT::i16))
1943 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1944 return TargetLoweringBase::getPreferredVectorAction(VT);
1945}
1946
1947bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1948 Type *Ty) const {
1949 // FIXME: Could be smarter if called for vector constants.
1950 return true;
1951}
1952
1953bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1954 unsigned Index) const {
1955 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1956 return false;
1957
1958 // TODO: Add more cases that are cheap.
1959 return Index == 0;
1960}
1961
1962bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1963 // TODO: This should be more aggressive, particular for 16-bit element
1964 // vectors. However there are some mixed improvements and regressions.
1965 EVT EltTy = VT.getVectorElementType();
1966 return EltTy.getSizeInBits() % 32 == 0;
1967}
1968
1969bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1970 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1971 switch (Op) {
1972 case ISD::LOAD:
1973 case ISD::STORE:
1974 return true;
1975 default:
1976 return false;
1977 }
1978 }
1979
1980 // SimplifySetCC uses this function to determine whether or not it should
1981 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1982 if (VT == MVT::i1 && Op == ISD::SETCC)
1983 return false;
1984
1985 return TargetLowering::isTypeDesirableForOp(Op, VT);
1986}
1987
1988SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1989 const SDLoc &SL,
1990 SDValue Chain,
1991 uint64_t Offset) const {
1992 const DataLayout &DL = DAG.getDataLayout();
1996
1997 auto [InputPtrReg, RC, ArgTy] =
1999
2000 // We may not have the kernarg segment argument if we have no kernel
2001 // arguments.
2002 if (!InputPtrReg)
2003 return DAG.getConstant(Offset, SL, PtrVT);
2004
2006 SDValue BasePtr = DAG.getCopyFromReg(
2007 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2008
2009 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2010}
2011
2012SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2013 const SDLoc &SL) const {
2016 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2017}
2018
2019SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2020 const SDLoc &SL) const {
2021
2023 std::optional<uint32_t> KnownSize =
2025 if (KnownSize.has_value())
2026 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2027 return SDValue();
2028}
2029
2030SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2031 const SDLoc &SL, SDValue Val,
2032 bool Signed,
2033 const ISD::InputArg *Arg) const {
2034 // First, if it is a widened vector, narrow it.
2035 if (VT.isVector() &&
2037 EVT NarrowedVT =
2040 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2041 DAG.getConstant(0, SL, MVT::i32));
2042 }
2043
2044 // Then convert the vector elements or scalar value.
2045 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2046 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2047 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2048 }
2049
2050 if (MemVT.isFloatingPoint())
2051 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2052 else if (Signed)
2053 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2054 else
2055 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2056
2057 return Val;
2058}
2059
2060SDValue SITargetLowering::lowerKernargMemParameter(
2061 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2062 uint64_t Offset, Align Alignment, bool Signed,
2063 const ISD::InputArg *Arg) const {
2065
2066 // Try to avoid using an extload by loading earlier than the argument address,
2067 // and extracting the relevant bits. The load should hopefully be merged with
2068 // the load for the previous argument.
2069 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2070 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2071 int64_t AlignDownOffset = alignDown(Offset, 4);
2072 int64_t OffsetDiff = Offset - AlignDownOffset;
2073
2074 EVT IntVT = MemVT.changeTypeToInteger();
2075
2076 // TODO: If we passed in the base kernel offset we could have a better
2077 // alignment than 4, but we don't really need it.
2078 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2079 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2082
2083 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2084 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2085
2086 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2087 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2088 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2089
2090 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2091 }
2092
2093 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2094 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2097
2098 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2099 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2100}
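
// A standalone sketch of the "load earlier and shift" trick above, performed
// on a plain byte buffer instead of DAG nodes; the names are hypothetical and
// a little-endian layout is assumed. For a 2-byte argument at byte offset 6:
// AlignDownOffset = 4, OffsetDiff = 2, so the dword at offset 4 is loaded and
// shifted right by 16 bits before truncating.
#include <cstdint>
#include <cstring>

static uint16_t loadShortViaAlignedDword(const uint8_t *KernArgSegment,
                                         uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;

  uint32_t Dword;
  std::memcpy(&Dword, KernArgSegment + AlignDownOffset, sizeof(Dword));

  // Mirrors the SRL by (OffsetDiff * 8) followed by the truncate above.
  return static_cast<uint16_t>(Dword >> (OffsetDiff * 8));
}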
2101
2102SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2103 CCValAssign &VA, const SDLoc &SL,
2104 SDValue Chain,
2105 const ISD::InputArg &Arg) const {
2107 MachineFrameInfo &MFI = MF.getFrameInfo();
2108
2109 if (Arg.Flags.isByVal()) {
2110 unsigned Size = Arg.Flags.getByValSize();
2111 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2112 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2113 }
2114
2115 unsigned ArgOffset = VA.getLocMemOffset();
2116 unsigned ArgSize = VA.getValVT().getStoreSize();
2117
2118 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2119
2120 // Create load nodes to retrieve arguments from the stack.
2121 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2122 SDValue ArgValue;
2123
2124 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2126 MVT MemVT = VA.getValVT();
2127
2128 switch (VA.getLocInfo()) {
2129 default:
2130 break;
2131 case CCValAssign::BCvt:
2132 MemVT = VA.getLocVT();
2133 break;
2134 case CCValAssign::SExt:
2135 ExtType = ISD::SEXTLOAD;
2136 break;
2137 case CCValAssign::ZExt:
2138 ExtType = ISD::ZEXTLOAD;
2139 break;
2140 case CCValAssign::AExt:
2141 ExtType = ISD::EXTLOAD;
2142 break;
2143 }
2144
2145 ArgValue = DAG.getExtLoad(
2146 ExtType, SL, VA.getLocVT(), Chain, FIN,
2148 return ArgValue;
2149}
2150
2151SDValue SITargetLowering::getPreloadedValue(
2152 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2154 const ArgDescriptor *Reg = nullptr;
2155 const TargetRegisterClass *RC;
2156 LLT Ty;
2157
2159 const ArgDescriptor WorkGroupIDX =
2160 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2161 // If GridZ is not programmed in an entry function then the hardware will set
2162 // it to all zeros, so there is no need to mask the GridY value in the low
2163 // order bits.
2164 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2165 AMDGPU::TTMP7,
2166 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2167 const ArgDescriptor WorkGroupIDZ =
2168 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2169 if (Subtarget->hasArchitectedSGPRs() &&
2171 switch (PVID) {
2173 Reg = &WorkGroupIDX;
2174 RC = &AMDGPU::SReg_32RegClass;
2175 Ty = LLT::scalar(32);
2176 break;
2178 Reg = &WorkGroupIDY;
2179 RC = &AMDGPU::SReg_32RegClass;
2180 Ty = LLT::scalar(32);
2181 break;
2183 Reg = &WorkGroupIDZ;
2184 RC = &AMDGPU::SReg_32RegClass;
2185 Ty = LLT::scalar(32);
2186 break;
2187 default:
2188 break;
2189 }
2190 }
2191
2192 if (!Reg)
2193 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2194 if (!Reg) {
2196 // It's possible for a kernarg intrinsic call to appear in a kernel with
2197 // no allocated segment, in which case we do not add the user sgpr
2198 // argument, so just return null.
2199 return DAG.getConstant(0, SDLoc(), VT);
2200 }
2201
2202 // It's undefined behavior if a function marked with the amdgpu-no-*
2203 // attributes uses the corresponding intrinsic.
2204 return DAG.getUNDEF(VT);
2205 }
2206
2207 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2208}
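
// A small sketch of the TTMP packing described above for architected SGPRs,
// assuming two plain 32-bit values stand in for TTMP9 and TTMP7; the struct
// and function names are hypothetical. X lives in TTMP9, while Y and Z share
// TTMP7 in the low and high 16 bits.
#include <cstdint>

struct WorkgroupIds { uint32_t X, Y, Z; };

static WorkgroupIds unpackArchitectedIds(uint32_t Ttmp9, uint32_t Ttmp7) {
  WorkgroupIds Ids;
  Ids.X = Ttmp9;
  Ids.Y = Ttmp7 & 0xFFFFu; // mask can be skipped when GridZ is not programmed
  Ids.Z = Ttmp7 >> 16;     // the 0xFFFF0000 field, shifted down
  return Ids;
}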
2209
2211 CallingConv::ID CallConv,
2212 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2213 FunctionType *FType,
2214 SIMachineFunctionInfo *Info) {
2215 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2216 const ISD::InputArg *Arg = &Ins[I];
2217
2218 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2219 "vector type argument should have been split");
2220
2221 // First check if it's a PS input addr.
2222 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2223 PSInputNum <= 15) {
2224 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2225
2226 // Inconveniently only the first part of the split is marked as isSplit,
2227 // so skip to the end. We only want to increment PSInputNum once for the
2228 // entire split argument.
2229 if (Arg->Flags.isSplit()) {
2230 while (!Arg->Flags.isSplitEnd()) {
2231 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2232 "unexpected vector split in ps argument type");
2233 if (!SkipArg)
2234 Splits.push_back(*Arg);
2235 Arg = &Ins[++I];
2236 }
2237 }
2238
2239 if (SkipArg) {
2240 // We can safely skip PS inputs.
2241 Skipped.set(Arg->getOrigArgIndex());
2242 ++PSInputNum;
2243 continue;
2244 }
2245
2246 Info->markPSInputAllocated(PSInputNum);
2247 if (Arg->Used)
2248 Info->markPSInputEnabled(PSInputNum);
2249
2250 ++PSInputNum;
2251 }
2252
2253 Splits.push_back(*Arg);
2254 }
2255}
2256
2257// Allocate special inputs passed in VGPRs.
2259 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2260 SIMachineFunctionInfo &Info) const {
2261 const LLT S32 = LLT::scalar(32);
2263
2264 if (Info.hasWorkItemIDX()) {
2265 Register Reg = AMDGPU::VGPR0;
2266 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2267
2268 CCInfo.AllocateReg(Reg);
2269 unsigned Mask =
2270 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2271 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2272 }
2273
2274 if (Info.hasWorkItemIDY()) {
2275 assert(Info.hasWorkItemIDX());
2276 if (Subtarget->hasPackedTID()) {
2277 Info.setWorkItemIDY(
2278 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2279 } else {
2280 unsigned Reg = AMDGPU::VGPR1;
2281 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2282
2283 CCInfo.AllocateReg(Reg);
2284 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2285 }
2286 }
2287
2288 if (Info.hasWorkItemIDZ()) {
2289 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2290 if (Subtarget->hasPackedTID()) {
2291 Info.setWorkItemIDZ(
2292 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2293 } else {
2294 unsigned Reg = AMDGPU::VGPR2;
2295 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2296
2297 CCInfo.AllocateReg(Reg);
2298 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2299 }
2300 }
2301}
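
// With packed TIDs, all three workitem IDs arrive in a single 32-bit VGPR
// using the 0x3ff masks set up above; extraction is mask-and-shift. This is a
// sketch with hypothetical names, not the in-tree lowering.
#include <cstdint>

static void unpackWorkitemIds(uint32_t Vgpr0, uint32_t &X, uint32_t &Y,
                              uint32_t &Z) {
  X = Vgpr0 & 0x3FFu;         // bits [9:0]
  Y = (Vgpr0 >> 10) & 0x3FFu; // bits [19:10]
  Z = (Vgpr0 >> 20) & 0x3FFu; // bits [29:20]
}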
2302
2303// Try to allocate a VGPR at the end of the argument list, or if no argument
2304// VGPRs are left, allocate a stack slot instead.
2305// If \p Mask is given, it indicates the bitfield position in the register.
2306// If \p Arg is given, reuse it with the new \p Mask instead of allocating.
2307static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2308 ArgDescriptor Arg = ArgDescriptor()) {
2309 if (Arg.isSet())
2310 return ArgDescriptor::createArg(Arg, Mask);
2311
2312 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2313 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2314 if (RegIdx == ArgVGPRs.size()) {
2315 // Spill to stack required.
2316 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2317
2318 return ArgDescriptor::createStack(Offset, Mask);
2319 }
2320
2321 unsigned Reg = ArgVGPRs[RegIdx];
2322 Reg = CCInfo.AllocateReg(Reg);
2323 assert(Reg != AMDGPU::NoRegister);
2324
2325 MachineFunction &MF = CCInfo.getMachineFunction();
2326 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2327 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2328 return ArgDescriptor::createRegister(Reg, Mask);
2329}
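
// A sketch of the allocation policy above: hand out the first free one of the
// 32 argument VGPRs, otherwise fall back to a 4-byte, 4-aligned stack slot.
// ArgLoc and the parameter names are hypothetical; the real code records the
// result in an ArgDescriptor.
#include <bitset>
#include <cstdint>

struct ArgLoc { bool OnStack; uint32_t RegIdxOrOffset; };

static ArgLoc allocateVgpr32OrStack(std::bitset<32> &UsedVgprs,
                                    uint32_t &NextStackOffset) {
  for (uint32_t I = 0; I < UsedVgprs.size(); ++I)
    if (!UsedVgprs.test(I)) {
      UsedVgprs.set(I);
      return {false, I}; // i.e. VGPR0 + I
    }
  uint32_t Offset = NextStackOffset; // assumed already 4-aligned
  NextStackOffset += 4;
  return {true, Offset};
}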
2330
2332 const TargetRegisterClass *RC,
2333 unsigned NumArgRegs) {
2334 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2335 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2336 if (RegIdx == ArgSGPRs.size())
2337 report_fatal_error("ran out of SGPRs for arguments");
2338
2339 unsigned Reg = ArgSGPRs[RegIdx];
2340 Reg = CCInfo.AllocateReg(Reg);
2341 assert(Reg != AMDGPU::NoRegister);
2342
2343 MachineFunction &MF = CCInfo.getMachineFunction();
2344 MF.addLiveIn(Reg, RC);
2346}
2347
2348// If this has a fixed position, we still should allocate the register in the
2349// CCInfo state. Technically we could get away with this for values passed
2350// outside of the normal argument range.
2352 const TargetRegisterClass *RC,
2353 MCRegister Reg) {
2354 Reg = CCInfo.AllocateReg(Reg);
2355 assert(Reg != AMDGPU::NoRegister);
2356 MachineFunction &MF = CCInfo.getMachineFunction();
2357 MF.addLiveIn(Reg, RC);
2358}
2359
2360static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2361 if (Arg) {
2362 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2363 Arg.getRegister());
2364 } else
2365 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2366}
2367
2368static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2369 if (Arg) {
2370 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2371 Arg.getRegister());
2372 } else
2373 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2374}
2375
2376/// Allocate implicit function VGPR arguments at the end of allocated user
2377/// arguments.
2379 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2380 SIMachineFunctionInfo &Info) const {
2381 const unsigned Mask = 0x3ff;
2382 ArgDescriptor Arg;
2383
2384 if (Info.hasWorkItemIDX()) {
2385 Arg = allocateVGPR32Input(CCInfo, Mask);
2386 Info.setWorkItemIDX(Arg);
2387 }
2388
2389 if (Info.hasWorkItemIDY()) {
2390 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2391 Info.setWorkItemIDY(Arg);
2392 }
2393
2394 if (Info.hasWorkItemIDZ())
2395 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2396}
2397
2398/// Allocate implicit function VGPR arguments in fixed registers.
2400 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2401 SIMachineFunctionInfo &Info) const {
2402 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2403 if (!Reg)
2404 report_fatal_error("failed to allocate VGPR for implicit arguments");
2405
2406 const unsigned Mask = 0x3ff;
2407 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2408 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2409 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2410}
2411
2413 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2414 SIMachineFunctionInfo &Info) const {
2415 auto &ArgInfo = Info.getArgInfo();
2416 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2417
2418 // TODO: Unify handling with private memory pointers.
2419 if (UserSGPRInfo.hasDispatchPtr())
2420 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2421
2422 if (UserSGPRInfo.hasQueuePtr())
2423 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2424
2425 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2426 // constant offset from the kernarg segment.
2427 if (Info.hasImplicitArgPtr())
2428 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2429
2430 if (UserSGPRInfo.hasDispatchID())
2431 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2432
2433 // flat_scratch_init is not applicable for non-kernel functions.
2434
2435 if (Info.hasWorkGroupIDX())
2436 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2437
2438 if (Info.hasWorkGroupIDY())
2439 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2440
2441 if (Info.hasWorkGroupIDZ())
2442 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2443
2444 if (Info.hasLDSKernelId())
2445 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2446}
2447
2448// Allocate special inputs passed in user SGPRs.
2450 MachineFunction &MF,
2451 const SIRegisterInfo &TRI,
2452 SIMachineFunctionInfo &Info) const {
2453 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2454 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2455 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2456 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2457 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2458 }
2459
2460 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2461 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2462 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2463 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2464 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2465 }
2466
2467 if (UserSGPRInfo.hasDispatchPtr()) {
2468 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2469 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2470 CCInfo.AllocateReg(DispatchPtrReg);
2471 }
2472
2473 if (UserSGPRInfo.hasQueuePtr()) {
2474 Register QueuePtrReg = Info.addQueuePtr(TRI);
2475 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2476 CCInfo.AllocateReg(QueuePtrReg);
2477 }
2478
2479 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2481 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2482 CCInfo.AllocateReg(InputPtrReg);
2483
2484 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2485 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2486 }
2487
2488 if (UserSGPRInfo.hasDispatchID()) {
2489 Register DispatchIDReg = Info.addDispatchID(TRI);
2490 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2491 CCInfo.AllocateReg(DispatchIDReg);
2492 }
2493
2494 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2495 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2496 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2497 CCInfo.AllocateReg(FlatScratchInitReg);
2498 }
2499
2500 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2501 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2502 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2503 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2504 }
2505
2506 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2507 // these from the dispatch pointer.
2508}
2509
2510// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2511// sequential starting from the first argument.
2513 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2515 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2516 Function &F = MF.getFunction();
2517 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2518 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2519 bool InPreloadSequence = true;
2520 unsigned InIdx = 0;
2521 bool AlignedForImplictArgs = false;
2522 unsigned ImplicitArgOffset = 0;
2523 for (auto &Arg : F.args()) {
2524 if (!InPreloadSequence || !Arg.hasInRegAttr())
2525 break;
2526
2527 unsigned ArgIdx = Arg.getArgNo();
2528 // Don't preload non-original args or parts not in the current preload
2529 // sequence.
2530 if (InIdx < Ins.size() &&
2531 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2532 break;
2533
2534 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2535 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2536 InIdx++) {
2537 assert(ArgLocs[ArgIdx].isMemLoc());
2538 auto &ArgLoc = ArgLocs[InIdx];
2539 const Align KernelArgBaseAlign = Align(16);
2540 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2541 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2542 unsigned NumAllocSGPRs =
2543 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2544
2545 // Fix alignment for hidden arguments.
2546 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2547 if (!AlignedForImplictArgs) {
2548 ImplicitArgOffset =
2549 alignTo(LastExplicitArgOffset,
2550 Subtarget->getAlignmentForImplicitArgPtr()) -
2551 LastExplicitArgOffset;
2552 AlignedForImplictArgs = true;
2553 }
2554 ArgOffset += ImplicitArgOffset;
2555 }
2556
2557 // Arg is preloaded into the previous SGPR.
2558 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2559 assert(InIdx >= 1 && "No previous SGPR");
2560 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2561 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2562 continue;
2563 }
2564
2565 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2566 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2567 // Check for free user SGPRs for preloading.
2568 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2569 InPreloadSequence = false;
2570 break;
2571 }
2572
2573 // Preload this argument.
2574 const TargetRegisterClass *RC =
2575 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2576 SmallVectorImpl<MCRegister> *PreloadRegs =
2577 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2578
2579 if (PreloadRegs->size() > 1)
2580 RC = &AMDGPU::SGPR_32RegClass;
2581 for (auto &Reg : *PreloadRegs) {
2582 assert(Reg);
2583 MF.addLiveIn(Reg, RC);
2584 CCInfo.AllocateReg(Reg);
2585 }
2586
2587 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2588 }
2589 }
2590}
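
// The SGPR accounting above reduces to the arithmetic sketched below: each
// preloaded argument needs alignTo(SizeInBits, 32) / 32 SGPRs, plus one
// padding SGPR per 4 bytes of gap since the previous argument ended. The
// helper is hypothetical and ignores the hidden-argument realignment step.
#include <cstdint>

static uint32_t sgprsForPreload(uint32_t ArgOffset, uint32_t SizeInBits,
                                uint32_t LastExplicitArgOffset) {
  uint32_t NumAllocSgprs = (SizeInBits + 31) / 32;  // alignTo(Size, 32) / 32
  uint32_t Padding = ArgOffset - LastExplicitArgOffset;
  uint32_t PaddingSgprs = (Padding + 3) / 4;        // alignTo(Padding, 4) / 4
  return PaddingSgprs + NumAllocSgprs;
}
// For example, an i64 at byte offset 8 following an i32 that ended at offset 4
// costs 1 padding SGPR + 2 value SGPRs = 3 free user SGPRs.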
2591
2593 const SIRegisterInfo &TRI,
2594 SIMachineFunctionInfo &Info) const {
2595 // Always allocate this last since it is a synthetic preload.
2596 if (Info.hasLDSKernelId()) {
2597 Register Reg = Info.addLDSKernelId();
2598 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2599 CCInfo.AllocateReg(Reg);
2600 }
2601}
2602
2603// Allocate special input registers that are initialized per-wave.
2606 CallingConv::ID CallConv,
2607 bool IsShader) const {
2608 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2609 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2610 // Note: user SGPRs are handled by the front-end for graphics shaders.
2611 // Pad up the used user SGPRs with dead inputs.
2612
2613 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2614 // before enabling architected SGPRs for workgroup IDs.
2615 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2616
2617 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2618 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2619 // rely on it to reach 16 since if we end up having no stack usage, it will
2620 // not really be added.
2621 unsigned NumRequiredSystemSGPRs =
2622 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2623 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2624 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2625 Register Reg = Info.addReservedUserSGPR();
2626 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2627 CCInfo.AllocateReg(Reg);
2628 }
2629 }
2630
2631 if (!HasArchitectedSGPRs) {
2632 if (Info.hasWorkGroupIDX()) {
2633 Register Reg = Info.addWorkGroupIDX();
2634 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2635 CCInfo.AllocateReg(Reg);
2636 }
2637
2638 if (Info.hasWorkGroupIDY()) {
2639 Register Reg = Info.addWorkGroupIDY();
2640 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2641 CCInfo.AllocateReg(Reg);
2642 }
2643
2644 if (Info.hasWorkGroupIDZ()) {
2645 Register Reg = Info.addWorkGroupIDZ();
2646 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2647 CCInfo.AllocateReg(Reg);
2648 }
2649 }
2650
2651 if (Info.hasWorkGroupInfo()) {
2652 Register Reg = Info.addWorkGroupInfo();
2653 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2654 CCInfo.AllocateReg(Reg);
2655 }
2656
2657 if (Info.hasPrivateSegmentWaveByteOffset()) {
2658 // Scratch wave offset passed in system SGPR.
2659 unsigned PrivateSegmentWaveByteOffsetReg;
2660
2661 if (IsShader) {
2662 PrivateSegmentWaveByteOffsetReg =
2663 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2664
2665 // This is true if the scratch wave byte offset doesn't have a fixed
2666 // location.
2667 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2668 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2669 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2670 }
2671 } else
2672 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2673
2674 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2675 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2676 }
2677
2678 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2679 Info.getNumPreloadedSGPRs() >= 16);
2680}
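
// For subtargets with the user-SGPR init16 bug, the loop above pads the
// preloaded SGPR count up to 16 with dead inputs; a sketch of that count,
// with hypothetical names:
static unsigned numDeadPaddingSgprs(unsigned CurrentUserSgprs,
                                    unsigned NumRequiredSystemSgprs) {
  unsigned Used = CurrentUserSgprs + NumRequiredSystemSgprs;
  return Used < 16 ? 16 - Used : 0;
}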
2681
2683 MachineFunction &MF,
2684 const SIRegisterInfo &TRI,
2685 SIMachineFunctionInfo &Info) {
2686 // Now that we've figured out where the scratch register inputs are, see if we
2687 // should reserve the arguments and use them directly.
2688 MachineFrameInfo &MFI = MF.getFrameInfo();
2689 bool HasStackObjects = MFI.hasStackObjects();
2690 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2691
2692 // Record that we know we have non-spill stack objects so we don't need to
2693 // check all stack objects later.
2694 if (HasStackObjects)
2695 Info.setHasNonSpillStackObjects(true);
2696
2697 // Everything live out of a block is spilled with fast regalloc, so it's
2698 // almost certain that spilling will be required.
2699 if (TM.getOptLevel() == CodeGenOptLevel::None)
2700 HasStackObjects = true;
2701
2702 // For now assume stack access is needed in any callee functions, so we need
2703 // to pass in the scratch registers.
2704 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2705
2706 if (!ST.enableFlatScratch()) {
2707 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2708 // If we have stack objects, we unquestionably need the private buffer
2709 // resource. For the Code Object V2 ABI, this will be the first 4 user
2710 // SGPR inputs. We can reserve those and use them directly.
2711
2712 Register PrivateSegmentBufferReg =
2714 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2715 } else {
2716 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2717 // We tentatively reserve the last registers (skipping the last registers
2718 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2719 // we'll replace these with the ones immediately after those which were
2720 // really allocated. In the prologue copies will be inserted from the
2721 // argument to these reserved registers.
2722
2723 // Without HSA, relocations are used for the scratch pointer and the
2724 // buffer resource setup is always inserted in the prologue. Scratch wave
2725 // offset is still in an input SGPR.
2726 Info.setScratchRSrcReg(ReservedBufferReg);
2727 }
2728 }
2729
2731
2732 // For entry functions we have to set up the stack pointer if we use it,
2733 // whereas non-entry functions get this "for free". This means there is no
2734 // intrinsic advantage to using S32 over S34 in cases where we do not have
2735 // calls but do need a frame pointer (i.e. if we are requested to have one
2736 // because frame pointer elimination is disabled). To keep things simple we
2737 // only ever use S32 as the call ABI stack pointer, and so using it does not
2738 // imply we need a separate frame pointer.
2739 //
2740 // Try to use s32 as the SP, but move it if it would interfere with input
2741 // arguments. This won't work with calls though.
2742 //
2743 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2744 // registers.
2745 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2746 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2747 } else {
2749
2750 if (MFI.hasCalls())
2751 report_fatal_error("call in graphics shader with too many input SGPRs");
2752
2753 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2754 if (!MRI.isLiveIn(Reg)) {
2755 Info.setStackPtrOffsetReg(Reg);
2756 break;
2757 }
2758 }
2759
2760 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2761 report_fatal_error("failed to find register for SP");
2762 }
2763
2764 // hasFP should be accurate for entry functions even before the frame is
2765 // finalized, because it does not rely on the known stack size, only
2766 // properties like whether variable sized objects are present.
2767 if (ST.getFrameLowering()->hasFP(MF)) {
2768 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2769 }
2770}
2771
2774 return !Info->isEntryFunction();
2775}
2776
2778
2780 MachineBasicBlock *Entry,
2781 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2783
2784 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2785 if (!IStart)
2786 return;
2787
2788 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2789 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2790 MachineBasicBlock::iterator MBBI = Entry->begin();
2791 for (const MCPhysReg *I = IStart; *I; ++I) {
2792 const TargetRegisterClass *RC = nullptr;
2793 if (AMDGPU::SReg_64RegClass.contains(*I))
2794 RC = &AMDGPU::SGPR_64RegClass;
2795 else if (AMDGPU::SReg_32RegClass.contains(*I))
2796 RC = &AMDGPU::SGPR_32RegClass;
2797 else
2798 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2799
2800 Register NewVR = MRI->createVirtualRegister(RC);
2801 // Create copy from CSR to a virtual register.
2802 Entry->addLiveIn(*I);
2803 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2804 .addReg(*I);
2805
2806 // Insert the copy-back instructions right before the terminator.
2807 for (auto *Exit : Exits)
2808 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2809 TII->get(TargetOpcode::COPY), *I)
2810 .addReg(NewVR);
2811 }
2812}
2813
2815 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2816 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2817 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2819
2821 const Function &Fn = MF.getFunction();
2824
2825 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2826 DiagnosticInfoUnsupported NoGraphicsHSA(
2827 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2828 DAG.getContext()->diagnose(NoGraphicsHSA);
2829 return DAG.getEntryNode();
2830 }
2831
2834 BitVector Skipped(Ins.size());
2835 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2836 *DAG.getContext());
2837
2838 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2839 bool IsKernel = AMDGPU::isKernel(CallConv);
2840 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2841
2842 if (IsGraphics) {
2843 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2844 assert(!UserSGPRInfo.hasDispatchPtr() &&
2845 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2846 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2847 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2848 (void)UserSGPRInfo;
2849 if (!Subtarget->enableFlatScratch())
2850 assert(!UserSGPRInfo.hasFlatScratchInit());
2851 if ((CallConv != CallingConv::AMDGPU_CS &&
2852 CallConv != CallingConv::AMDGPU_Gfx) ||
2853 !Subtarget->hasArchitectedSGPRs())
2854 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2855 !Info->hasWorkGroupIDZ());
2856 }
2857
2858 if (CallConv == CallingConv::AMDGPU_PS) {
2859 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2860
2861 // At least one interpolation mode must be enabled or else the GPU will
2862 // hang.
2863 //
2864 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2865 // set PSInputAddr, the user wants to enable some bits after the compilation
2866 // based on run-time states. Since we can't know what the final PSInputEna
2867 // will look like, so we shouldn't do anything here and the user should take
2868 // responsibility for the correct programming.
2869 //
2870 // Otherwise, the following restrictions apply:
2871 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2872 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2873 // enabled too.
2874 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2875 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2876 CCInfo.AllocateReg(AMDGPU::VGPR0);
2877 CCInfo.AllocateReg(AMDGPU::VGPR1);
2878 Info->markPSInputAllocated(0);
2879 Info->markPSInputEnabled(0);
2880 }
2881 if (Subtarget->isAmdPalOS()) {
2882 // For isAmdPalOS, the user does not enable some bits after compilation
2883 // based on run-time states; the register values being generated here are
2884 // the final ones set in hardware. Therefore we need to apply the
2885 // workaround to PSInputAddr and PSInputEnable together. (The case where
2886 // a bit is set in PSInputAddr but not PSInputEnable is where the
2887 // frontend set up an input arg for a particular interpolation mode, but
2888 // nothing uses that input arg. Really we should have an earlier pass
2889 // that removes such an arg.)
2890 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2891 if ((PsInputBits & 0x7F) == 0 ||
2892 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2893 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2894 }
2895 } else if (IsKernel) {
2896 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2897 } else {
2898 Splits.append(Ins.begin(), Ins.end());
2899 }
2900
2901 if (IsKernel)
2902 analyzeFormalArgumentsCompute(CCInfo, Ins);
2903
2904 if (IsEntryFunc) {
2905 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2906 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2907 if (IsKernel && Subtarget->hasKernargPreload())
2908 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2909
2910 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2911 } else if (!IsGraphics) {
2912 // For the fixed ABI, pass workitem IDs in the last argument register.
2913 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2914
2915 // FIXME: Sink this into allocateSpecialInputSGPRs
2916 if (!Subtarget->enableFlatScratch())
2917 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2918
2919 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2920 }
2921
2922 if (!IsKernel) {
2923 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2924 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2925 }
2926
2928
2929 // FIXME: This is the minimum kernel argument alignment. We should improve
2930 // this to the maximum alignment of the arguments.
2931 //
2932 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2933 // kern arg offset.
2934 const Align KernelArgBaseAlign = Align(16);
2935
2936 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2937 const ISD::InputArg &Arg = Ins[i];
2938 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2939 InVals.push_back(DAG.getUNDEF(Arg.VT));
2940 continue;
2941 }
2942
2943 CCValAssign &VA = ArgLocs[ArgIdx++];
2944 MVT VT = VA.getLocVT();
2945
2946 if (IsEntryFunc && VA.isMemLoc()) {
2947 VT = Ins[i].VT;
2948 EVT MemVT = VA.getLocVT();
2949
2950 const uint64_t Offset = VA.getLocMemOffset();
2951 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2952
2953 if (Arg.Flags.isByRef()) {
2954 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2955
2956 const GCNTargetMachine &TM =
2957 static_cast<const GCNTargetMachine &>(getTargetMachine());
2958 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2959 Arg.Flags.getPointerAddrSpace())) {
2962 }
2963
2964 InVals.push_back(Ptr);
2965 continue;
2966 }
2967
2968 SDValue NewArg;
2969 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2970 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2971 // In this case the argument is packed into the previous preload SGPR.
2972 int64_t AlignDownOffset = alignDown(Offset, 4);
2973 int64_t OffsetDiff = Offset - AlignDownOffset;
2974 EVT IntVT = MemVT.changeTypeToInteger();
2975
2979 Register Reg =
2980 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2981
2982 assert(Reg);
2983 Register VReg = MRI.getLiveInVirtReg(Reg);
2984 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2985
2986 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2987 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2988
2989 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2990 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2991 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2992 Ins[i].Flags.isSExt(), &Ins[i]);
2993
2994 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2995 } else {
2999 const SmallVectorImpl<MCRegister> &PreloadRegs =
3000 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3001
3002 SDValue Copy;
3003 if (PreloadRegs.size() == 1) {
3004 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3005 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3006 NewArg = DAG.getCopyFromReg(
3007 Chain, DL, VReg,
3009 TRI->getRegSizeInBits(*RC)));
3010
3011 } else {
3012 // If the kernarg alignment does not match the alignment of the SGPR
3013 // tuple RC that can accommodate this argument, it will be built up
3014 // via copies from the individual SGPRs that the argument was
3015 // preloaded to.
3017 for (auto Reg : PreloadRegs) {
3018 Register VReg = MRI.getLiveInVirtReg(Reg);
3019 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3020 Elts.push_back(Copy);
3021 }
3022 NewArg =
3023 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3024 PreloadRegs.size()),
3025 DL, Elts);
3026 }
3027
3028 // If the argument was preloaded to multiple consecutive 32-bit
3029 // registers because of misalignment between addressable SGPR tuples
3030 // and the argument size, we can still assume, because of kernarg
3031 // segment alignment restrictions, that NewArg's size is the same as
3032 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3033 // truncate since we cannot preload to less than a single SGPR and the
3034 // MemVT may be smaller.
3035 EVT MemVTInt =
3037 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3038 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3039
3040 NewArg = DAG.getBitcast(MemVT, NewArg);
3041 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3042 Ins[i].Flags.isSExt(), &Ins[i]);
3043 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3044 }
3045 } else {
3046 // Hidden arguments that are in the kernel signature must be preloaded
3047 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3048 // the argument list and is not preloaded.
3049 if (Arg.isOrigArg()) {
3050 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3051 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3052 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3053 *OrigArg->getParent(),
3054 "hidden argument in kernel signature was not preloaded",
3055 DL.getDebugLoc());
3056 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3057 }
3058 }
3059
3060 NewArg =
3061 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3062 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3063 }
3064 Chains.push_back(NewArg.getValue(1));
3065
3066 auto *ParamTy =
3067 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3069 ParamTy &&
3070 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3071 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3072 // On SI local pointers are just offsets into LDS, so they are always
3073 // less than 16-bits. On CI and newer they could potentially be
3074 // real pointers, so we can't guarantee their size.
3075 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3076 DAG.getValueType(MVT::i16));
3077 }
3078
3079 InVals.push_back(NewArg);
3080 continue;
3081 }
3082 if (!IsEntryFunc && VA.isMemLoc()) {
3083 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3084 InVals.push_back(Val);
3085 if (!Arg.Flags.isByVal())
3086 Chains.push_back(Val.getValue(1));
3087 continue;
3088 }
3089
3090 assert(VA.isRegLoc() && "Parameter must be in a register!");
3091
3092 Register Reg = VA.getLocReg();
3093 const TargetRegisterClass *RC = nullptr;
3094 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3095 RC = &AMDGPU::VGPR_32RegClass;
3096 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3097 RC = &AMDGPU::SGPR_32RegClass;
3098 else
3099 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3100 EVT ValVT = VA.getValVT();
3101
3102 Reg = MF.addLiveIn(Reg, RC);
3103 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3104
3105 if (Arg.Flags.isSRet()) {
3106 // The return object should be reasonably addressable.
3107
3108 // FIXME: This helps when the return is a real sret. If it is an
3109 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3110 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3111 unsigned NumBits =
3113 Val = DAG.getNode(
3114 ISD::AssertZext, DL, VT, Val,
3115 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3116 }
3117
3118 // If this is an 8 or 16-bit value, it is really passed promoted
3119 // to 32 bits. Insert an assert[sz]ext to capture this, then
3120 // truncate to the right size.
3121 switch (VA.getLocInfo()) {
3122 case CCValAssign::Full:
3123 break;
3124 case CCValAssign::BCvt:
3125 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3126 break;
3127 case CCValAssign::SExt:
3128 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3129 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3130 break;
3131 case CCValAssign::ZExt:
3132 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3133 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3134 break;
3135 case CCValAssign::AExt:
3136 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3137 break;
3138 default:
3139 llvm_unreachable("Unknown loc info!");
3140 }
3141
3142 InVals.push_back(Val);
3143 }
3144
3145 // Start adding system SGPRs.
3146 if (IsEntryFunc)
3147 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3148
3149 // DAG.getPass() returns nullptr when using new pass manager.
3150 // TODO: Use DAG.getMFAM() to access analysis result.
3151 if (DAG.getPass()) {
3152 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3153 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3154 }
3155
3156 unsigned StackArgSize = CCInfo.getStackSize();
3157 Info->setBytesInStackArgArea(StackArgSize);
3158
3159 return Chains.empty() ? Chain
3160 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3161}
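
// A sketch of the interpolation-mode restriction handled in
// LowerFormalArguments above: at least one PERSP_* (bits 0-3) or LINEAR_*
// (bits 4-6) input must be enabled, and POS_W_FLOAT (bit 11) additionally
// requires a PERSP_* input. The helper name is hypothetical; when it returns
// true, the code above forces input 0 on and reserves VGPR0/VGPR1.
#include <cstdint>

static bool needsForcedInterpInput(uint32_t PsInputAddr,
                                   bool Input11Allocated) {
  bool NoPerspOrLinear = (PsInputAddr & 0x7F) == 0;
  bool PosWWithoutPersp = (PsInputAddr & 0xF) == 0 && Input11Allocated;
  return NoPerspOrLinear || PosWWithoutPersp;
}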
3162
3163// TODO: If return values can't fit in registers, we should return as many as
3164// possible in registers before passing on stack.
3166 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3167 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3168 const Type *RetTy) const {
3169 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3170 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3171 // for shaders. Vector types should be explicitly handled by CC.
3172 if (AMDGPU::isEntryFunctionCC(CallConv))
3173 return true;
3174
3176 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3177 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3178 return false;
3179
3180 // We must use the stack if return would require unavailable registers.
3181 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3182 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3183 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3184 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3185 return false;
3186
3187 return true;
3188}
3189
3190SDValue
3192 bool isVarArg,
3194 const SmallVectorImpl<SDValue> &OutVals,
3195 const SDLoc &DL, SelectionDAG &DAG) const {
3198
3199 if (AMDGPU::isKernel(CallConv)) {
3200 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3201 OutVals, DL, DAG);
3202 }
3203
3204 bool IsShader = AMDGPU::isShader(CallConv);
3205
3206 Info->setIfReturnsVoid(Outs.empty());
3207 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3208
3209 // CCValAssign - represent the assignment of the return value to a location.
3212
3213 // CCState - Info about the registers and stack slots.
3214 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3215 *DAG.getContext());
3216
3217 // Analyze outgoing return values.
3218 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3219
3220 SDValue Glue;
3222 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3223
3224 // Copy the result values into the output registers.
3225 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3226 ++I, ++RealRVLocIdx) {
3227 CCValAssign &VA = RVLocs[I];
3228 assert(VA.isRegLoc() && "Can only return in registers!");
3229 // TODO: Partially return in registers if return values don't fit.
3230 SDValue Arg = OutVals[RealRVLocIdx];
3231
3232 // Copied from other backends.
3233 switch (VA.getLocInfo()) {
3234 case CCValAssign::Full:
3235 break;
3236 case CCValAssign::BCvt:
3237 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3238 break;
3239 case CCValAssign::SExt:
3240 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3241 break;
3242 case CCValAssign::ZExt:
3243 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3244 break;
3245 case CCValAssign::AExt:
3246 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3247 break;
3248 default:
3249 llvm_unreachable("Unknown loc info!");
3250 }
3251
3252 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3253 Glue = Chain.getValue(1);
3254 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3255 }
3256
3257 // FIXME: Does sret work properly?
3258 if (!Info->isEntryFunction()) {
3259 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3260 const MCPhysReg *I =
3261 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3262 if (I) {
3263 for (; *I; ++I) {
3264 if (AMDGPU::SReg_64RegClass.contains(*I))
3265 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3266 else if (AMDGPU::SReg_32RegClass.contains(*I))
3267 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3268 else
3269 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3270 }
3271 }
3272 }
3273
3274 // Update chain and glue.
3275 RetOps[0] = Chain;
3276 if (Glue.getNode())
3277 RetOps.push_back(Glue);
3278
3279 unsigned Opc = AMDGPUISD::ENDPGM;
3280 if (!IsWaveEnd)
3282 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3283}
3284
3286 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3287 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3288 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3289 SDValue ThisVal) const {
3290 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3291
3292 // Assign locations to each value returned by this call.
3294 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3295 *DAG.getContext());
3296 CCInfo.AnalyzeCallResult(Ins, RetCC);
3297
3298 // Copy all of the result registers out of their specified physreg.
3299 for (CCValAssign VA : RVLocs) {
3300 SDValue Val;
3301
3302 if (VA.isRegLoc()) {
3303 Val =
3304 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3305 Chain = Val.getValue(1);
3306 InGlue = Val.getValue(2);
3307 } else if (VA.isMemLoc()) {
3308 report_fatal_error("TODO: return values in memory");
3309 } else
3310 llvm_unreachable("unknown argument location type");
3311
3312 switch (VA.getLocInfo()) {
3313 case CCValAssign::Full:
3314 break;
3315 case CCValAssign::BCvt:
3316 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3317 break;
3318 case CCValAssign::ZExt:
3319 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3320 DAG.getValueType(VA.getValVT()));
3321 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3322 break;
3323 case CCValAssign::SExt:
3324 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3325 DAG.getValueType(VA.getValVT()));
3326 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3327 break;
3328 case CCValAssign::AExt:
3329 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3330 break;
3331 default:
3332 llvm_unreachable("Unknown loc info!");
3333 }
3334
3335 InVals.push_back(Val);
3336 }
3337
3338 return Chain;
3339}
3340
3341// Add code to pass the special inputs required by the used features, separate
3342// from the explicit user arguments present in the IR.
3344 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3345 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3346 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3347 // If we don't have a call site, this was a call inserted by
3348 // legalization. These can never use special inputs.
3349 if (!CLI.CB)
3350 return;
3351
3352 SelectionDAG &DAG = CLI.DAG;
3353 const SDLoc &DL = CLI.DL;
3354 const Function &F = DAG.getMachineFunction().getFunction();
3355
3356 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3357 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3358
3359 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3361 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3362 // DAG.getPass() returns nullptr when using new pass manager.
3363 // TODO: Use DAG.getMFAM() to access analysis result.
3364 if (DAG.getPass()) {
3365 auto &ArgUsageInfo =
3367 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3368 }
3369 }
3370
3371 // TODO: Unify with private memory register handling. This is complicated by
3372 // the fact that at least in kernels, the input argument is not necessarily
3373 // in the same location as the input.
3374 // clang-format off
3375 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3377 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3378 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3379 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3380 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3381 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3382 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3383 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3384 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3385 };
3386 // clang-format on
3387
3388 for (auto [InputID, Attr] : ImplicitAttrs) {
3389 // If the callee does not use the attribute value, skip copying the value.
3390 if (CLI.CB->hasFnAttr(Attr))
3391 continue;
3392
3393 const auto [OutgoingArg, ArgRC, ArgTy] =
3394 CalleeArgInfo->getPreloadedValue(InputID);
3395 if (!OutgoingArg)
3396 continue;
3397
3398 const auto [IncomingArg, IncomingArgRC, Ty] =
3399 CallerArgInfo.getPreloadedValue(InputID);
3400 assert(IncomingArgRC == ArgRC);
3401
3402 // All special arguments are ints for now.
3403 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3404 SDValue InputReg;
3405
3406 if (IncomingArg) {
3407 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3408 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3409 // The implicit arg ptr is special because it doesn't have a corresponding
3410 // input for kernels, and is computed from the kernarg segment pointer.
3411 InputReg = getImplicitArgPtr(DAG, DL);
3412 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3413 std::optional<uint32_t> Id =
3415 if (Id.has_value()) {
3416 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3417 } else {
3418 InputReg = DAG.getUNDEF(ArgVT);
3419 }
3420 } else {
3421 // We may have proven the input wasn't needed, although the ABI is
3422 // requiring it. We just need to allocate the register appropriately.
3423 InputReg = DAG.getUNDEF(ArgVT);
3424 }
3425
3426 if (OutgoingArg->isRegister()) {
3427 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3428 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3429 report_fatal_error("failed to allocate implicit input argument");
3430 } else {
3431 unsigned SpecialArgOffset =
3432 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3433 SDValue ArgStore =
3434 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3435 MemOpChains.push_back(ArgStore);
3436 }
3437 }
3438
3439 // Pack workitem IDs into a single register, or pass them as-is if already
3440 // packed.
3441
3442 auto [OutgoingArg, ArgRC, Ty] =
3444 if (!OutgoingArg)
3445 std::tie(OutgoingArg, ArgRC, Ty) =
3447 if (!OutgoingArg)
3448 std::tie(OutgoingArg, ArgRC, Ty) =
3450 if (!OutgoingArg)
3451 return;
3452
3453 const ArgDescriptor *IncomingArgX = std::get<0>(
3455 const ArgDescriptor *IncomingArgY = std::get<0>(
3457 const ArgDescriptor *IncomingArgZ = std::get<0>(
3459
3460 SDValue InputReg;
3461 SDLoc SL;
3462
3463 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3464 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3465 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3466
3467 // If incoming ids are not packed we need to pack them.
3468 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3469 NeedWorkItemIDX) {
3470 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3471 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3472 } else {
3473 InputReg = DAG.getConstant(0, DL, MVT::i32);
3474 }
3475 }
3476
3477 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3478 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3479 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3480 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3481 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3482 InputReg = InputReg.getNode()
3483 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3484 : Y;
3485 }
3486
3487 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3488 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3489 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3490 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3491 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3492 InputReg = InputReg.getNode()
3493 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3494 : Z;
3495 }
3496
3497 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3498 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3499 // We're in a situation where the outgoing function requires the workitem
3500 // ID, but the calling function does not have it (e.g. a graphics function
3501 // calling a C calling convention function). This is illegal, but we need
3502 // to produce something.
3503 InputReg = DAG.getUNDEF(MVT::i32);
3504 } else {
3505 // Workitem IDs are already packed; any of the present incoming arguments
3506 // will carry all required fields.
3507 ArgDescriptor IncomingArg =
3508 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3509 : IncomingArgY ? *IncomingArgY
3510 : *IncomingArgZ,
3511 ~0u);
3512 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3513 }
3514 }
3515
3516 if (OutgoingArg->isRegister()) {
3517 if (InputReg)
3518 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3519
3520 CCInfo.AllocateReg(OutgoingArg->getRegister());
3521 } else {
3522 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3523 if (InputReg) {
3524 SDValue ArgStore =
3525 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3526 MemOpChains.push_back(ArgStore);
3527 }
3528 }
3529}
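
// The forwarding logic above packs the three workitem IDs into one register
// exactly like the entry-point packing: X in bits [9:0], Y shifted left by 10,
// and Z shifted left by 20, all OR'd together. A hypothetical standalone
// equivalent:
#include <cstdint>

static uint32_t packWorkitemIds(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3FFu) | ((Y & 0x3FFu) << 10) | ((Z & 0x3FFu) << 20);
}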
3530
3532 return CC == CallingConv::Fast;
3533}
3534
3535/// Return true if we might ever do TCO for calls with this calling convention.
3537 switch (CC) {
3538 case CallingConv::C:
3540 return true;
3541 default:
3542 return canGuaranteeTCO(CC);
3543 }
3544}
3545
3547 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3549 const SmallVectorImpl<SDValue> &OutVals,
3550 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3551 if (AMDGPU::isChainCC(CalleeCC))
3552 return true;
3553
3554 if (!mayTailCallThisCC(CalleeCC))
3555 return false;
3556
3557 // For a divergent call target, we need to do a waterfall loop over the
3558 // possible callees which precludes us from using a simple jump.
3559 if (Callee->isDivergent())
3560 return false;
3561
3563 const Function &CallerF = MF.getFunction();
3564 CallingConv::ID CallerCC = CallerF.getCallingConv();
3566 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3567
3568 // Kernels aren't callable, and don't have a live in return address so it
3569 // doesn't make sense to do a tail call with entry functions.
3570 if (!CallerPreserved)
3571 return false;
3572
3573 bool CCMatch = CallerCC == CalleeCC;
3574
3576 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3577 return true;
3578 return false;
3579 }
3580
3581 // TODO: Can we handle var args?
3582 if (IsVarArg)
3583 return false;
3584
3585 for (const Argument &Arg : CallerF.args()) {
3586 if (Arg.hasByValAttr())
3587 return false;
3588 }
3589
3590 LLVMContext &Ctx = *DAG.getContext();
3591
3592 // Check that the call results are passed in the same way.
3593 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3594 CCAssignFnForCall(CalleeCC, IsVarArg),
3595 CCAssignFnForCall(CallerCC, IsVarArg)))
3596 return false;
3597
3598 // The callee has to preserve all registers the caller needs to preserve.
3599 if (!CCMatch) {
3600 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3601 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3602 return false;
3603 }
3604
3605 // Nothing more to check if the callee is taking no arguments.
3606 if (Outs.empty())
3607 return true;
3608
3610 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3611
3612 // FIXME: We are not allocating special input registers, so we will be
3613 // deciding based on incorrect register assignments.
3614 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3615
3616 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3617 // If the stack arguments for this call do not fit into our own save area then
3618 // the call cannot be made tail.
3619 // TODO: Is this really necessary?
3620 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3621 return false;
3622
3623 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3624 // FIXME: What about inreg arguments that end up passed in memory?
3625 if (!CCVA.isRegLoc())
3626 continue;
3627
3628 // If we are passing an argument in an SGPR, and the value is divergent,
3629 // this call requires a waterfall loop.
3630 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3631 LLVM_DEBUG(
3632 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3633 << printReg(CCVA.getLocReg(), TRI) << '\n');
3634 return false;
3635 }
3636 }
3637
3638 const MachineRegisterInfo &MRI = MF.getRegInfo();
3639 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3640}
3641
3643 if (!CI->isTailCall())
3644 return false;
3645
3646 const Function *ParentFn = CI->getParent()->getParent();
3648 return false;
3649 return true;
3650}
3651
3652// The wave scratch offset register is used as the global base pointer.
3654 SmallVectorImpl<SDValue> &InVals) const {
3655 CallingConv::ID CallConv = CLI.CallConv;
3656 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3657
3658 SelectionDAG &DAG = CLI.DAG;
3659
3660 TargetLowering::ArgListEntry RequestedExec;
3661 if (IsChainCallConv) {
3662 // The last argument should be the value that we need to put in EXEC.
3663 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3664 // don't treat it like the rest of the arguments.
3665 RequestedExec = CLI.Args.back();
3666 assert(RequestedExec.Node && "No node for EXEC");
3667
3668 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3669 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3670
3671 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3672 CLI.Outs.pop_back();
3673 CLI.OutVals.pop_back();
3674
3675 if (RequestedExec.Ty->isIntegerTy(64)) {
3676 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3677 CLI.Outs.pop_back();
3678 CLI.OutVals.pop_back();
3679 }
3680
3681 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3682 "Haven't popped all the pieces of the EXEC mask");
3683 }
3684
3685 const SDLoc &DL = CLI.DL;
3687 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3689 SDValue Chain = CLI.Chain;
3690 SDValue Callee = CLI.Callee;
3691 bool &IsTailCall = CLI.IsTailCall;
3692 bool IsVarArg = CLI.IsVarArg;
3693 bool IsSibCall = false;
3695
3696 if (Callee.isUndef() || isNullConstant(Callee)) {
3697 if (!CLI.IsTailCall) {
3698 for (ISD::InputArg &Arg : CLI.Ins)
3699 InVals.push_back(DAG.getUNDEF(Arg.VT));
3700 }
3701
3702 return Chain;
3703 }
3704
3705 if (IsVarArg) {
3706 return lowerUnhandledCall(CLI, InVals,
3707 "unsupported call to variadic function ");
3708 }
3709
3710 if (!CLI.CB)
3711 report_fatal_error("unsupported libcall legalization");
3712
3713 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3714 return lowerUnhandledCall(CLI, InVals,
3715 "unsupported required tail call to function ");
3716 }
3717
3718 if (IsTailCall) {
3719 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3720 Outs, OutVals, Ins, DAG);
3721 if (!IsTailCall &&
3722 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3723 report_fatal_error("failed to perform tail call elimination on a call "
3724 "site marked musttail or on llvm.amdgcn.cs.chain");
3725 }
3726
3727 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3728
3729 // A sibling call is one where we're under the usual C ABI and not planning
3730 // to change that but can still do a tail call:
3731 if (!TailCallOpt && IsTailCall)
3732 IsSibCall = true;
3733
3734 if (IsTailCall)
3735 ++NumTailCalls;
3736 }
3737
3740 SmallVector<SDValue, 8> MemOpChains;
3741
3742 // Analyze operands of the call, assigning locations to each operand.
3744 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3745 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3746
3747 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3748 // With a fixed ABI, allocate fixed registers before user arguments.
3749 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3750 }
3751
3752 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3753
3754 // Get a count of how many bytes are to be pushed on the stack.
3755 unsigned NumBytes = CCInfo.getStackSize();
3756
3757 if (IsSibCall) {
3758 // Since we're not changing the ABI to make this a tail call, the memory
3759 // operands are already available in the caller's incoming argument space.
3760 NumBytes = 0;
3761 }
3762
3763 // FPDiff is the byte offset of the call's argument area from the callee's.
3764 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3765 // by this amount for a tail call. In a sibling call it must be 0 because the
3766 // caller will deallocate the entire stack and the callee still expects its
3767 // arguments to begin at SP+0. Completely unused for non-tail calls.
3768 int32_t FPDiff = 0;
3769 MachineFrameInfo &MFI = MF.getFrameInfo();
3770 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3771
3772 // Adjust the stack pointer for the new arguments...
3773 // These operations are automatically eliminated by the prolog/epilog pass
3774 if (!IsSibCall)
3775 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3776
3777 if (!IsSibCall || IsChainCallConv) {
3778 if (!Subtarget->enableFlatScratch()) {
3779 SmallVector<SDValue, 4> CopyFromChains;
3780
3781 // In the HSA case, this should be an identity copy.
3782 SDValue ScratchRSrcReg =
3783 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3784 RegsToPass.emplace_back(IsChainCallConv
3785 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3786 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3787 ScratchRSrcReg);
3788 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3789 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3790 }
3791 }
3792
3793 const unsigned NumSpecialInputs = RegsToPass.size();
3794
3795 MVT PtrVT = MVT::i32;
3796
3797 // Walk the register/memloc assignments, inserting copies/loads.
3798 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3799 CCValAssign &VA = ArgLocs[i];
3800 SDValue Arg = OutVals[i];
3801
3802 // Promote the value if needed.
3803 switch (VA.getLocInfo()) {
3804 case CCValAssign::Full:
3805 break;
3806 case CCValAssign::BCvt:
3807 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3808 break;
3809 case CCValAssign::ZExt:
3810 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3811 break;
3812 case CCValAssign::SExt:
3813 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3814 break;
3815 case CCValAssign::AExt:
3816 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3817 break;
3818 case CCValAssign::FPExt:
3819 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3820 break;
3821 default:
3822 llvm_unreachable("Unknown loc info!");
3823 }
3824
3825 if (VA.isRegLoc()) {
3826 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3827 } else {
3828 assert(VA.isMemLoc());
3829
3830 SDValue DstAddr;
3831 MachinePointerInfo DstInfo;
3832
3833 unsigned LocMemOffset = VA.getLocMemOffset();
3834 int32_t Offset = LocMemOffset;
3835
3836 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3837 MaybeAlign Alignment;
3838
3839 if (IsTailCall) {
3840 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3841 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3842 : VA.getValVT().getStoreSize();
3843
3844 // FIXME: We can do better than the minimum required byval alignment.
3845 Alignment =
3846 Flags.isByVal()
3847 ? Flags.getNonZeroByValAlign()
3848 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3849
3850 Offset = Offset + FPDiff;
3851 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3852
3853 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3854 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3855
3856 // Make sure any stack arguments overlapping with where we're storing
3857 // are loaded before this eventual operation. Otherwise they'll be
3858 // clobbered.
3859
3860 // FIXME: Why is this really necessary? This seems to just result in a
3861 // lot of code to copy the stack and write them back to the same
3862 // locations, which are supposed to be immutable?
3863 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3864 } else {
3865 // Stores to the argument stack area are relative to the stack pointer.
3866 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3867 MVT::i32);
3868 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3869 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3870 Alignment =
3871 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3872 }
3873
3874 if (Outs[i].Flags.isByVal()) {
3875 SDValue SizeNode =
3876 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3877 SDValue Cpy =
3878 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3879 Outs[i].Flags.getNonZeroByValAlign(),
3880 /*isVol = */ false, /*AlwaysInline = */ true,
3881 /*CI=*/nullptr, std::nullopt, DstInfo,
3883
3884 MemOpChains.push_back(Cpy);
3885 } else {
3886 SDValue Store =
3887 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3888 MemOpChains.push_back(Store);
3889 }
3890 }
3891 }
3892
3893 if (!MemOpChains.empty())
3894 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3895
3896 SDValue ReadFirstLaneID =
3897 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3898
3899 SDValue TokenGlue;
3900 if (CLI.ConvergenceControlToken) {
3901 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3903 }
3904
3905 // Build a sequence of copy-to-reg nodes chained together with token chain
3906 // and flag operands which copy the outgoing args into the appropriate regs.
3907 SDValue InGlue;
3908
3909 unsigned ArgIdx = 0;
3910 for (auto [Reg, Val] : RegsToPass) {
3911 if (ArgIdx++ >= NumSpecialInputs &&
3912 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3913 // For chain calls, the inreg arguments are required to be
3914 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3915 // they are uniform.
3916 //
3917 // For other calls, if an inreg argument is known to be uniform,
3918 // speculatively insert a readfirstlane in case it is in a VGPR.
3919 //
3920 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3921 // value, so let that continue to produce invalid code.
3922
3923 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3924 if (TokenGlue)
3925 ReadfirstlaneArgs.push_back(TokenGlue);
3927 ReadfirstlaneArgs);
3928 }
3929
3930 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3931 InGlue = Chain.getValue(1);
3932 }
3933
3934 // We don't usually want to end the call-sequence here because we would tidy
3935 // the frame up *after* the call; however, in the ABI-changing tail-call case
3936 // we've carefully laid out the parameters so that when sp is reset they'll be
3937 // in the correct location.
3938 if (IsTailCall && !IsSibCall) {
3939 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3940 InGlue = Chain.getValue(1);
3941 }
3942
3943 std::vector<SDValue> Ops({Chain});
3944
3945 // Add a redundant copy of the callee global which will not be legalized, as
3946 // we need direct access to the callee later.
3947 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3948 const GlobalValue *GV = GSD->getGlobal();
3949 Ops.push_back(Callee);
3950 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3951 } else {
3952 if (IsTailCall) {
3953 // isEligibleForTailCallOptimization considered whether the call target is
3954 // divergent, but we may still end up with a uniform value in a VGPR.
3955 // Insert a readfirstlane just in case.
3956 SDValue ReadFirstLaneID =
3957 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3958
3959 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3960 if (TokenGlue)
3961 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3962 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3963 ReadfirstlaneArgs);
3964 }
3965
3966 Ops.push_back(Callee);
3967 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3968 }
3969
3970 if (IsTailCall) {
3971 // Each tail call may have to adjust the stack by a different amount, so
3972 // this information must travel along with the operation for eventual
3973 // consumption by emitEpilogue.
3974 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3975 }
3976
3977 if (IsChainCallConv)
3978 Ops.push_back(RequestedExec.Node);
3979
3980 // Add argument registers to the end of the list so that they are known live
3981 // into the call.
3982 for (auto &[Reg, Val] : RegsToPass)
3983 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3984
3985 // Add a register mask operand representing the call-preserved registers.
3986 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3987 assert(Mask && "Missing call preserved mask for calling convention");
3988 Ops.push_back(DAG.getRegisterMask(Mask));
3989
3990 if (SDValue Token = CLI.ConvergenceControlToken) {
3992 GlueOps.push_back(Token);
3993 if (InGlue)
3994 GlueOps.push_back(InGlue);
3995
3996 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3997 MVT::Glue, GlueOps),
3998 0);
3999 }
4000
4001 if (InGlue)
4002 Ops.push_back(InGlue);
4003
4004 // If we're doing a tail call, use a TC_RETURN here rather than an
4005 // actual call instruction.
4006 if (IsTailCall) {
4007 MFI.setHasTailCall();
4008 unsigned OPC = AMDGPUISD::TC_RETURN;
4009 switch (CallConv) {
4012 break;
4016 break;
4017 }
4018
4019 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4020 }
4021
4022 // Returns a chain and a flag for retval copy to use.
4023 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4024 Chain = Call.getValue(0);
4025 InGlue = Call.getValue(1);
4026
4027 uint64_t CalleePopBytes = NumBytes;
4028 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4029 if (!Ins.empty())
4030 InGlue = Chain.getValue(1);
4031
4032 // Handle result values, copying them out of physregs into vregs that we
4033 // return.
4034 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4035 InVals, /*IsThisReturn=*/false, SDValue());
4036}
4037
4038// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4039// except for:
4040 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4041 // 2. Size scaling, where scale = wave-reduction(alloca-size) * wave-size.
4043 SelectionDAG &DAG) const {
4044 const MachineFunction &MF = DAG.getMachineFunction();
4046
4047 SDLoc dl(Op);
4048 EVT VT = Op.getValueType();
4049 SDValue Chain = Op.getOperand(0);
4050 Register SPReg = Info->getStackPtrOffsetReg();
4051
4052 // Chain the dynamic stack allocation so that it doesn't modify the stack
4053 // pointer when other instructions are using the stack.
4054 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4055
4056 SDValue Size = Op.getOperand(1);
4057 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4058 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4059
4060 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4062 "Stack grows upwards for AMDGPU");
4063
4064 Chain = BaseAddr.getValue(1);
4065 Align StackAlign = TFL->getStackAlign();
4066 if (Alignment > StackAlign) {
4067 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4068 << Subtarget->getWavefrontSizeLog2();
4069 uint64_t StackAlignMask = ScaledAlignment - 1;
4070 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4071 DAG.getConstant(StackAlignMask, dl, VT));
4072 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4073 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4074 }
4075
4076 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4077 SDValue NewSP;
4078 if (isa<ConstantSDNode>(Size)) {
4079 // For constant sized alloca, scale alloca size by wave-size
4080 SDValue ScaledSize = DAG.getNode(
4081 ISD::SHL, dl, VT, Size,
4082 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4083 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4084 } else {
4085 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4086 // max of the alloca size (divergent) and then scale it by the wave size.
4087 SDValue WaveReduction =
4088 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4089 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4090 Size, DAG.getConstant(0, dl, MVT::i32));
4091 SDValue ScaledSize = DAG.getNode(
4092 ISD::SHL, dl, VT, Size,
4093 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4094 NewSP =
4095 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4096 SDValue ReadFirstLaneID =
4097 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4098 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4099 NewSP);
4100 }
4101
4102 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4103 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4104
4105 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4106}
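// Illustrative sketch (not from the source above): a scalar model of the
// wave-scaled stack bump performed by lowerDYNAMIC_STACKALLOC, assuming the
// alloca size has already been made wave-uniform (wave_reduce_umax) and that
// <cstdint> is available. All names below are hypothetical.
static uint64_t bumpWaveScratch(uint64_t &SP, uint64_t AllocaSize,
                                uint64_t AllocaAlign, unsigned WaveSizeLog2) {
  // A per-lane alignment becomes a per-wave alignment in the swizzled layout.
  uint64_t ScaledAlign = AllocaAlign << WaveSizeLog2;
  uint64_t Base = (SP + ScaledAlign - 1) & ~(ScaledAlign - 1);
  // The AMDGPU private stack grows upwards: SP advances past the allocation,
  // whose size is likewise scaled by the wave size.
  SP = Base + (AllocaSize << WaveSizeLog2);
  return Base; // value produced for the alloca
}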
4107
4109 if (Op.getValueType() != MVT::i32)
4110 return Op; // Defer to cannot select error.
4111
4113 SDLoc SL(Op);
4114
4115 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4116
4117 // Convert from wave uniform to swizzled vector address. This should protect
4118 // from any edge cases where the stacksave result isn't directly used with
4119 // stackrestore.
4120 SDValue VectorAddress =
4121 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4122 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4123}
4124
4126 SelectionDAG &DAG) const {
4127 SDLoc SL(Op);
4128 assert(Op.getValueType() == MVT::i32);
4129
4130 uint32_t BothRoundHwReg =
4132 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4133
4134 SDValue IntrinID =
4135 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4136 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4137 Op.getOperand(0), IntrinID, GetRoundBothImm);
4138
4139 // There are two rounding modes, one for f32 and one for f64/f16. We only
4140 // report in the standard value range if both are the same.
4141 //
4142 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4143 // ties away from zero is not supported, and the other values are rotated by
4144 // 1.
4145 //
4146 // If the two rounding modes are not the same, report a target defined value.
4147
4148 // Mode register rounding mode fields:
4149 //
4150 // [1:0] Single-precision round mode.
4151 // [3:2] Double/Half-precision round mode.
4152 //
4153 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4154 //
4155 //                 Hardware   Spec
4156 //  Toward-0           3        0
4157 //  Nearest Even       0        1
4158 //  +Inf               1        2
4159 //  -Inf               2        3
4160 //  NearestAway0      N/A       4
4161 //
4162 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4163 // table we can index by the raw hardware mode.
4164 //
4165 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4166
4167 SDValue BitTable =
4169
4170 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4171 SDValue RoundModeTimesNumBits =
4172 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4173
4174 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4175 // knew only one mode was demanded.
4176 SDValue TableValue =
4177 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4178 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4179
4180 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4181 SDValue TableEntry =
4182 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4183
4184 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4185 // if it's an extended value.
4186 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4187 SDValue IsStandardValue =
4188 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4189 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4190 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4191 TableEntry, EnumOffset);
4192
4193 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4194}
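// Illustrative sketch (not from the source above): the table lookup performed
// by LowerGET_ROUNDING, written as plain C++. `Table` stands for the 64-bit
// FltRoundConversionTable of 4-bit entries defined elsewhere in the backend;
// the function name is hypothetical and <cstdint> is assumed.
static unsigned hwRoundModeToFltRounds(uint64_t Table, unsigned ModeFpRound) {
  // MODE.fp_round is a 4-bit field ([1:0] f32, [3:2] f64/f16), so it selects
  // one of sixteen 4-bit table entries.
  unsigned Entry = unsigned(Table >> (ModeFpRound * 4)) & 0xf;
  // Entries below 4 are the standard FLT_ROUNDS values; larger entries encode
  // target-defined extended values, which are reported offset by 4.
  return Entry < 4 ? Entry : Entry + 4;
}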
4195
4197 SelectionDAG &DAG) const {
4198 SDLoc SL(Op);
4199
4200 SDValue NewMode = Op.getOperand(1);
4201 assert(NewMode.getValueType() == MVT::i32);
4202
4203 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4204 // hardware MODE.fp_round values.
4205 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4206 uint32_t ClampedVal = std::min(
4207 static_cast<uint32_t>(ConstMode->getZExtValue()),
4209 NewMode = DAG.getConstant(
4210 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4211 } else {
4212 // If we know the input can only be one of the supported standard modes in
4213 // the range 0-3, we can use a simplified mapping to hardware values.
4214 KnownBits KB = DAG.computeKnownBits(NewMode);
4215 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4216 // The supported standard values are 0-3. The extended values start at 8. We
4217 // need to offset by 4 if the value is in the extended range.
4218
4219 if (UseReducedTable) {
4220 // Truncate to the low bits of the table; the 4 standard modes need only 4 x 4 = 16 bits.
4221 SDValue BitTable = DAG.getConstant(
4222 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4223
4224 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4225 SDValue RoundModeTimesNumBits =
4226 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4227
4228 NewMode =
4229 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4230
4231 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4232 // the table extracted bits into inline immediates.
4233 } else {
4234 // table_index = umin(value, value - 4)
4235 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4236 SDValue BitTable =
4238
4239 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4240 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4241 SDValue IndexVal =
4242 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4243
4244 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4245 SDValue RoundModeTimesNumBits =
4246 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4247
4248 SDValue TableValue =
4249 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4250 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4251
4252 // No need to mask out the high bits since the setreg will ignore them
4253 // anyway.
4254 NewMode = TruncTable;
4255 }
4256
4257 // Insert a readfirstlane in case the value is a VGPR. We could do this
4258 // earlier and keep more operations scalar, but that interferes with
4259 // combining the source.
4260 SDValue ReadFirstLaneID =
4261 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4262 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4263 ReadFirstLaneID, NewMode);
4264 }
4265
4266 // N.B. The setreg will be later folded into s_round_mode on supported
4267 // targets.
4268 SDValue IntrinID =
4269 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4270 uint32_t BothRoundHwReg =
4272 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4273
4274 SDValue SetReg =
4275 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4276 IntrinID, RoundBothImm, NewMode);
4277
4278 return SetReg;
4279}
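// Illustrative sketch (not from the source above): the reverse mapping used by
// LowerSET_ROUNDING for a non-constant FLT_ROUNDS value, with `Table` standing
// for the 64-bit FltRoundToHWConversionTable of 4-bit entries. The function
// name is hypothetical; <cstdint> and <algorithm> are assumed.
static unsigned fltRoundsToHwRoundMode(uint64_t Table, unsigned FltRounds) {
  // Standard values are 0-3 and extended values start at 8, so
  // umin(x, x - 4) folds both ranges into one contiguous table index
  // (for x < 4 the subtraction wraps and is never the minimum).
  unsigned Index = std::min(FltRounds, FltRounds - 4);
  // Each entry is 4 bits wide; the setreg only consumes the low 4 bits.
  return unsigned(Table >> (Index * 4)) & 0xf;
}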
4280
4282 if (Op->isDivergent())
4283 return SDValue();
4284
4285 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4290 break;
4291 default:
4292 return SDValue();
4293 }
4294
4295 return Op;
4296}
4297
4298 // Work around DAG legality rules that are based only on the result type.
4300 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4301 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4302 EVT SrcVT = Src.getValueType();
4303
4304 if (SrcVT.getScalarType() != MVT::bf16)
4305 return Op;
4306
4307 SDLoc SL(Op);
4308 SDValue BitCast =
4309 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4310
4311 EVT DstVT = Op.getValueType();
4312 if (IsStrict)
4313 llvm_unreachable("Need STRICT_BF16_TO_FP");
4314
4315 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4316}
4317
4319 SDLoc SL(Op);
4320 if (Op.getValueType() != MVT::i64)
4321 return Op;
4322
4323 uint32_t ModeHwReg =
4325 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4326 uint32_t TrapHwReg =
4328 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4329
4330 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4331 SDValue IntrinID =
4332 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4333 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4334 Op.getOperand(0), IntrinID, ModeHwRegImm);
4335 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4336 Op.getOperand(0), IntrinID, TrapHwRegImm);
4337 SDValue TokenReg =
4338 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4339 GetTrapReg.getValue(1));
4340
4341 SDValue CvtPtr =
4342 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4343 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4344
4345 return DAG.getMergeValues({Result, TokenReg}, SL);
4346}
4347
4349 SDLoc SL(Op);
4350 if (Op.getOperand(1).getValueType() != MVT::i64)
4351 return Op;
4352
4353 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4354 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4355 DAG.getConstant(0, SL, MVT::i32));
4356 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4357 DAG.getConstant(1, SL, MVT::i32));
4358
4359 SDValue ReadFirstLaneID =
4360 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4361 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4362 ReadFirstLaneID, NewModeReg);
4363 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4364 ReadFirstLaneID, NewTrapReg);
4365
4366 unsigned ModeHwReg =
4368 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4369 unsigned TrapHwReg =
4371 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4372
4373 SDValue IntrinID =
4374 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4375 SDValue SetModeReg =
4376 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4377 IntrinID, ModeHwRegImm, NewModeReg);
4378 SDValue SetTrapReg =
4379 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4380 IntrinID, TrapHwRegImm, NewTrapReg);
4381 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4382}
4383
4385 const MachineFunction &MF) const {
4387 .Case("m0", AMDGPU::M0)
4388 .Case("exec", AMDGPU::EXEC)
4389 .Case("exec_lo", AMDGPU::EXEC_LO)
4390 .Case("exec_hi", AMDGPU::EXEC_HI)
4391 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4392 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4393 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4394 .Default(Register());
4395
4396 if (Reg == AMDGPU::NoRegister) {
4398 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4399 }
4400
4401 if (!Subtarget->hasFlatScrRegister() &&
4402 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4403 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4404 "\" for subtarget."));
4405 }
4406
4407 switch (Reg) {
4408 case AMDGPU::M0:
4409 case AMDGPU::EXEC_LO:
4410 case AMDGPU::EXEC_HI:
4411 case AMDGPU::FLAT_SCR_LO:
4412 case AMDGPU::FLAT_SCR_HI:
4413 if (VT.getSizeInBits() == 32)
4414 return Reg;
4415 break;
4416 case AMDGPU::EXEC:
4417 case AMDGPU::FLAT_SCR:
4418 if (VT.getSizeInBits() == 64)
4419 return Reg;
4420 break;
4421 default:
4422 llvm_unreachable("missing register type checking");
4423 }
4424
4426 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4427}
4428
4429// If kill is not the last instruction, split the block so kill is always a
4430// proper terminator.
4433 MachineBasicBlock *BB) const {
4434 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4436 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4437 return SplitBB;
4438}
4439
4440 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4441// \p MI will be the only instruction in the loop body block. Otherwise, it will
4442// be the first instruction in the remainder block.
4443//
4444/// \returns { LoopBody, Remainder }
4445static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4449
4450 // To insert the loop we need to split the block. Move everything after this
4451 // point to a new block, and insert a new empty block between the two.
4453 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4455 ++MBBI;
4456
4457 MF->insert(MBBI, LoopBB);
4458 MF->insert(MBBI, RemainderBB);
4459
4460 LoopBB->addSuccessor(LoopBB);
4461 LoopBB->addSuccessor(RemainderBB);
4462
4463 // Move the rest of the block into a new block.
4464 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4465
4466 if (InstInLoop) {
4467 auto Next = std::next(I);
4468
4469 // Move instruction to loop body.
4470 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4471
4472 // Move the rest of the block.
4473 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4474 } else {
4475 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4476 }
4477
4478 MBB.addSuccessor(LoopBB);
4479
4480 return std::pair(LoopBB, RemainderBB);
4481}
4482
4483/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4485 MachineBasicBlock *MBB = MI.getParent();
4487 auto I = MI.getIterator();
4488 auto E = std::next(I);
4489
4490 // clang-format off
4491 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4492 .addImm(0);
4493 // clang-format on
4494
4495 MIBundleBuilder Bundler(*MBB, I, E);
4496 finalizeBundle(*MBB, Bundler.begin());
4497}
4498
4501 MachineBasicBlock *BB) const {
4502 const DebugLoc &DL = MI.getDebugLoc();
4503
4505
4507
4508 // Apparently kill flags are only valid if the def is in the same block?
4509 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4510 Src->setIsKill(false);
4511
4512 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4513
4514 MachineBasicBlock::iterator I = LoopBB->end();
4515
4516 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4518
4519 // Clear TRAP_STS.MEM_VIOL
4520 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4521 .addImm(0)
4522 .addImm(EncodedReg);
4523
4525
4526 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4527
4528 // Load and check TRAP_STS.MEM_VIOL
4529 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4530 .addImm(EncodedReg);
4531
4532 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4533 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4534 .addReg(Reg, RegState::Kill)
4535 .addImm(0);
4536 // clang-format off
4537 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4538 .addMBB(LoopBB);
4539 // clang-format on
4540
4541 return RemainderBB;
4542}
4543
4544// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4545// wavefront. If the value is uniform and just happens to be in a VGPR, this
4546// will only do one iteration. In the worst case, this will loop 64 times.
4547//
4548// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4551 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4552 const DebugLoc &DL, const MachineOperand &Idx,
4553 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4554 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4555 Register &SGPRIdxReg) {
4556
4557 MachineFunction *MF = OrigBB.getParent();
4558 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4559 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4561
4562 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4563 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4564 Register NewExec = MRI.createVirtualRegister(BoolRC);
4565 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4566 Register CondReg = MRI.createVirtualRegister(BoolRC);
4567
4568 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4569 .addReg(InitReg)
4570 .addMBB(&OrigBB)
4571 .addReg(ResultReg)
4572 .addMBB(&LoopBB);
4573
4574 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4575 .addReg(InitSaveExecReg)
4576 .addMBB(&OrigBB)
4577 .addReg(NewExec)
4578 .addMBB(&LoopBB);
4579
4580 // Read the next variant <- also loop target.
4581 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4582 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4583
4584 // Compare the just read M0 value to all possible Idx values.
4585 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4586 .addReg(CurrentIdxReg)
4587 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4588
4589 // Update EXEC, save the original EXEC value to VCC.
4590 BuildMI(LoopBB, I, DL,
4591 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4592 : AMDGPU::S_AND_SAVEEXEC_B64),
4593 NewExec)
4594 .addReg(CondReg, RegState::Kill);
4595
4596 MRI.setSimpleHint(NewExec, CondReg);
4597
4598 if (UseGPRIdxMode) {
4599 if (Offset == 0) {
4600 SGPRIdxReg = CurrentIdxReg;
4601 } else {
4602 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4603 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4604 .addReg(CurrentIdxReg, RegState::Kill)
4605 .addImm(Offset);
4606 }
4607 } else {
4608 // Move index from VCC into M0
4609 if (Offset == 0) {
4610 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4611 .addReg(CurrentIdxReg, RegState::Kill);
4612 } else {
4613 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4614 .addReg(CurrentIdxReg, RegState::Kill)
4615 .addImm(Offset);
4616 }
4617 }
4618
4619 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4620 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4621 MachineInstr *InsertPt =
4622 BuildMI(LoopBB, I, DL,
4623 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4624 : AMDGPU::S_XOR_B64_term),
4625 Exec)
4626 .addReg(Exec)
4627 .addReg(NewExec);
4628
4629 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4630 // s_cbranch_scc0?
4631
4632 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4633 // clang-format off
4634 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4635 .addMBB(&LoopBB);
4636 // clang-format on
4637
4638 return InsertPt->getIterator();
4639}
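// Illustrative sketch (not from the source above): a scalar model of the
// waterfall loop built by emitLoadM0FromVGPRLoop. Each iteration reads the
// index held by the first still-active lane (v_readfirstlane_b32), handles
// every lane that holds the same index under the restricted EXEC mask, and
// xors those lanes out, so the loop runs once per unique index value. A
// 64-lane wave and <cstdint> are assumed; all names are hypothetical.
static void waterfallModel(uint64_t ExecMask, const uint32_t LaneIdx[64]) {
  while (ExecMask) {
    unsigned FirstLane = __builtin_ctzll(ExecMask); // like s_ff1_i32_b64
    uint32_t UniformIdx = LaneIdx[FirstLane];       // v_readfirstlane_b32
    for (unsigned L = 0; L != 64; ++L) {
      if (((ExecMask >> L) & 1) && LaneIdx[L] == UniformIdx) {
        // ... perform the indexed access for lane L with UniformIdx in M0 ...
        ExecMask &= ~(1ull << L); // lane done (s_xor_b64 with the saved mask)
      }
    }
  }
}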
4640
4641 // This has slightly sub-optimal regalloc when the source vector is killed by
4642 // the read. The register allocator does not understand that the kill is
4643 // per-workitem, so the value is kept alive for the whole loop and we end up not
4644 // re-using a subregister from it, using 1 more VGPR than necessary. This extra
4645 // VGPR was avoided when this was expanded after register allocation.
4648 unsigned InitResultReg, unsigned PhiReg, int Offset,
4649 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4651 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4652 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4654 const DebugLoc &DL = MI.getDebugLoc();
4656
4657 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4658 Register DstReg = MI.getOperand(0).getReg();
4659 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4660 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4661 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4662 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4663
4664 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4665
4666 // Save the EXEC mask
4667 // clang-format off
4668 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4669 .addReg(Exec);
4670 // clang-format on
4671
4672 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4673
4674 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4675
4676 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4677 InitResultReg, DstReg, PhiReg, TmpExec,
4678 Offset, UseGPRIdxMode, SGPRIdxReg);
4679
4680 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4682 ++MBBI;
4683 MF->insert(MBBI, LandingPad);
4684 LoopBB->removeSuccessor(RemainderBB);
4685 LandingPad->addSuccessor(RemainderBB);
4686 LoopBB->addSuccessor(LandingPad);
4687 MachineBasicBlock::iterator First = LandingPad->begin();
4688 // clang-format off
4689 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4690 .addReg(SaveExec);
4691 // clang-format on
4692
4693 return InsPt;
4694}
4695
4696// Returns subreg index, offset
4697static std::pair<unsigned, int>
4699 const TargetRegisterClass *SuperRC, unsigned VecReg,
4700 int Offset) {
4701 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4702
4703 // Skip out of bounds offsets, or else we would end up using an undefined
4704 // register.
4705 if (Offset >= NumElts || Offset < 0)
4706 return std::pair(AMDGPU::sub0, Offset);
4707
4708 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4709}
4710
4713 int Offset) {
4714 MachineBasicBlock *MBB = MI.getParent();
4715 const DebugLoc &DL = MI.getDebugLoc();
4717
4718 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4719
4720 assert(Idx->getReg() != AMDGPU::NoRegister);
4721
4722 if (Offset == 0) {
4723 // clang-format off
4724 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4725 .add(*Idx);
4726 // clang-format on
4727 } else {
4728 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4729 .add(*Idx)
4730 .addImm(Offset);
4731 }
4732}
4733
4736 int Offset) {
4737 MachineBasicBlock *MBB = MI.getParent();
4738 const DebugLoc &DL = MI.getDebugLoc();
4740
4741 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4742
4743 if (Offset == 0)
4744 return Idx->getReg();
4745
4746 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4747 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4748 .add(*Idx)
4749 .addImm(Offset);
4750 return Tmp;
4751}
4752
4755 const GCNSubtarget &ST) {
4756 const SIInstrInfo *TII = ST.getInstrInfo();
4757 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4760
4761 Register Dst = MI.getOperand(0).getReg();
4762 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4763 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4764 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4765
4766 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4767 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4768
4769 unsigned SubReg;
4770 std::tie(SubReg, Offset) =
4771 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4772
4773 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4774
4775 // Check for a SGPR index.
4776 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4778 const DebugLoc &DL = MI.getDebugLoc();
4779
4780 if (UseGPRIdxMode) {
4781 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4782 // to avoid interfering with other uses, so probably requires a new
4783 // optimization pass.
4785
4786 const MCInstrDesc &GPRIDXDesc =
4787 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4788 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4789 .addReg(SrcReg)
4790 .addReg(Idx)
4791 .addImm(SubReg);
4792 } else {
4794
4795 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4796 .addReg(SrcReg, 0, SubReg)
4797 .addReg(SrcReg, RegState::Implicit);
4798 }
4799
4800 MI.eraseFromParent();
4801
4802 return &MBB;
4803 }
4804
4805 // Control flow needs to be inserted if indexing with a VGPR.
4806 const DebugLoc &DL = MI.getDebugLoc();
4808
4809 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4810 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4811
4812 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4813
4814 Register SGPRIdxReg;
4815 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4816 UseGPRIdxMode, SGPRIdxReg);
4817
4818 MachineBasicBlock *LoopBB = InsPt->getParent();
4819
4820 if (UseGPRIdxMode) {
4821 const MCInstrDesc &GPRIDXDesc =
4822 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4823
4824 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4825 .addReg(SrcReg)
4826 .addReg(SGPRIdxReg)
4827 .addImm(SubReg);
4828 } else {
4829 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4830 .addReg(SrcReg, 0, SubReg)
4831 .addReg(SrcReg, RegState::Implicit);
4832 }
4833
4834 MI.eraseFromParent();
4835
4836 return LoopBB;
4837}
4838
4841 const GCNSubtarget &ST) {
4842 const SIInstrInfo *TII = ST.getInstrInfo();
4843 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4846
4847 Register Dst = MI.getOperand(0).getReg();
4848 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4849 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4850 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4851 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4852 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4853 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4854
4855 // This can be an immediate, but will be folded later.
4856 assert(Val->getReg());
4857
4858 unsigned SubReg;
4859 std::tie(SubReg, Offset) =
4860 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4861 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4862
4863 if (Idx->getReg() == AMDGPU::NoRegister) {
4865 const DebugLoc &DL = MI.getDebugLoc();
4866
4867 assert(Offset == 0);
4868
4869 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4870 .add(*SrcVec)
4871 .add(*Val)
4872 .addImm(SubReg);
4873
4874 MI.eraseFromParent();
4875 return &MBB;
4876 }
4877
4878 // Check for a SGPR index.
4879 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4881 const DebugLoc &DL = MI.getDebugLoc();
4882
4883 if (UseGPRIdxMode) {
4885
4886 const MCInstrDesc &GPRIDXDesc =
4887 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4888 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4889 .addReg(SrcVec->getReg())
4890 .add(*Val)
4891 .addReg(Idx)
4892 .addImm(SubReg);
4893 } else {
4895
4896 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4897 TRI.getRegSizeInBits(*VecRC), 32, false);
4898 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4899 .addReg(SrcVec->getReg())
4900 .add(*Val)
4901 .addImm(SubReg);
4902 }
4903 MI.eraseFromParent();
4904 return &MBB;
4905 }
4906
4907 // Control flow needs to be inserted if indexing with a VGPR.
4908 if (Val->isReg())
4909 MRI.clearKillFlags(Val->getReg());
4910
4911 const DebugLoc &DL = MI.getDebugLoc();
4912
4913 Register PhiReg = MRI.createVirtualRegister(VecRC);
4914
4915 Register SGPRIdxReg;
4916 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4917 UseGPRIdxMode, SGPRIdxReg);
4918 MachineBasicBlock *LoopBB = InsPt->getParent();
4919
4920 if (UseGPRIdxMode) {
4921 const MCInstrDesc &GPRIDXDesc =
4922 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4923
4924 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4925 .addReg(PhiReg)
4926 .add(*Val)
4927 .addReg(SGPRIdxReg)
4928 .addImm(SubReg);
4929 } else {
4930 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4931 TRI.getRegSizeInBits(*VecRC), 32, false);
4932 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4933 .addReg(PhiReg)
4934 .add(*Val)
4935 .addImm(SubReg);
4936 }
4937
4938 MI.eraseFromParent();
4939 return LoopBB;
4940}
4941
4944 const GCNSubtarget &ST,
4945 unsigned Opc) {
4947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4948 const DebugLoc &DL = MI.getDebugLoc();
4949 const SIInstrInfo *TII = ST.getInstrInfo();
4950
4951 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4952 Register SrcReg = MI.getOperand(1).getReg();
4953 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4954 Register DstReg = MI.getOperand(0).getReg();
4955 MachineBasicBlock *RetBB = nullptr;
4956 if (isSGPR) {
4957 // These operations are idempotent on a uniform value, i.e. an SGPR:
4958 // the reduced value will be the same as the given SGPR.
4959 // clang-format off
4960 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4961 .addReg(SrcReg);
4962 // clang-format on
4963 RetBB = &BB;
4964 } else {
4965 // TODO: Implement DPP Strategy and switch based on immediate strategy
4966 // operand. For now, for all the cases (default, Iterative and DPP) we use
4967 // the iterative approach by default.
4968
4969 // To reduce the VGPR using the iterative approach, we need to iterate
4970 // over all the active lanes. The lowering consists of a ComputeLoop,
4971 // which iterates over only the active lanes. We use a copy of the EXEC register
4972 // as the induction variable, and every active lane clears its bit using bitset0
4973 // so that we get the next active lane for the next iteration.
4975 Register SrcReg = MI.getOperand(1).getReg();
4976
4977 // Create Control flow for loop
4978 // Split MI's Machine Basic block into For loop
4979 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4980
4981 // Create virtual registers required for lowering.
4982 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4983 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4984 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4985 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4986
4987 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4988 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4989 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4990
4991 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4992 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4993
4994 bool IsWave32 = ST.isWave32();
4995 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4996 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4997
4998 // Create initial values of the induction variable (from EXEC) and the
4999 // accumulator, and insert a branch to the newly created ComputeLoop block.
5000 uint32_t InitalValue =
5001 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5002 auto TmpSReg =
5003 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5004 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5005 .addImm(InitalValue);
5006 // clang-format off
5007 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5008 .addMBB(ComputeLoop);
5009 // clang-format on
5010
5011 // Start constructing ComputeLoop
5012 I = ComputeLoop->end();
5013 auto Accumulator =
5014 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5015 .addReg(InitalValReg)
5016 .addMBB(&BB);
5017 auto ActiveBits =
5018 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5019 .addReg(TmpSReg->getOperand(0).getReg())
5020 .addMBB(&BB);
5021
5022 // Perform the computations
5023 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5024 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5025 .addReg(ActiveBits->getOperand(0).getReg());
5026 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5027 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5028 .addReg(SrcReg)
5029 .addReg(FF1->getOperand(0).getReg());
5030 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5031 .addReg(Accumulator->getOperand(0).getReg())
5032 .addReg(LaneValue->getOperand(0).getReg());
5033
5034 // Clear the processed lane in the induction variable to get the next active lane
5035 unsigned BITSETOpc =
5036 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5037 auto NewActiveBits =
5038 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5039 .addReg(FF1->getOperand(0).getReg())
5040 .addReg(ActiveBits->getOperand(0).getReg());
5041
5042 // Add phi nodes
5043 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5044 .addMBB(ComputeLoop);
5045 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5046 .addMBB(ComputeLoop);
5047
5048 // Compare the remaining active lanes and branch back to ComputeLoop if any are left
5049 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5050 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5051 .addReg(NewActiveBits->getOperand(0).getReg())
5052 .addImm(0);
5053 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5054 .addMBB(ComputeLoop);
5055
5056 RetBB = ComputeEnd;
5057 }
5058 MI.eraseFromParent();
5059 return RetBB;
5060}
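// Illustrative sketch (not from the source above): a scalar model of the
// iterative reduction built by lowerWaveReduce for a divergent (VGPR) source,
// shown for the unsigned-min case. A copy of EXEC is the induction variable;
// each iteration reads the first remaining active lane's value, folds it into
// the accumulator, and clears that lane with bitset0. A 64-lane wave plus
// <cstdint> and <algorithm> are assumed; all names are hypothetical.
static uint32_t waveReduceUMinModel(uint64_t ExecMask,
                                    const uint32_t LaneVal[64]) {
  uint32_t Acc = UINT32_MAX;                   // identity value for umin
  while (ExecMask) {
    unsigned Lane = __builtin_ctzll(ExecMask); // s_ff1_i32_b64
    Acc = std::min(Acc, LaneVal[Lane]);        // v_readlane_b32 + s_min_u32
    ExecMask &= ~(1ull << Lane);               // s_bitset0_b64
  }
  return Acc;
}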
5061
5064 MachineBasicBlock *BB) const {
5065
5067 MachineFunction *MF = BB->getParent();
5069
5070 switch (MI.getOpcode()) {
5071 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5072 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5073 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5074 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5075 case AMDGPU::S_UADDO_PSEUDO:
5076 case AMDGPU::S_USUBO_PSEUDO: {
5077 const DebugLoc &DL = MI.getDebugLoc();
5078 MachineOperand &Dest0 = MI.getOperand(0);
5079 MachineOperand &Dest1 = MI.getOperand(1);
5080 MachineOperand &Src0 = MI.getOperand(2);
5081 MachineOperand &Src1 = MI.getOperand(3);
5082
5083 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5084 ? AMDGPU::S_ADD_I32
5085 : AMDGPU::S_SUB_I32;
5086 // clang-format off
5087 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5088 .add(Src0)
5089 .add(Src1);
5090 // clang-format on
5091
5092 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5093 .addImm(1)
5094 .addImm(0);
5095
5096 MI.eraseFromParent();
5097 return BB;
5098 }
5099 case AMDGPU::S_ADD_U64_PSEUDO:
5100 case AMDGPU::S_SUB_U64_PSEUDO: {
5101 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5102 // For GFX12, we emit s_add_u64 and s_sub_u64.
5103 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5105 const DebugLoc &DL = MI.getDebugLoc();
5106 MachineOperand &Dest = MI.getOperand(0);
5107 MachineOperand &Src0 = MI.getOperand(1);
5108 MachineOperand &Src1 = MI.getOperand(2);
5109 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5110 if (Subtarget->hasScalarAddSub64()) {
5111 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5112 // clang-format off
5113 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5114 .add(Src0)
5115 .add(Src1);
5116 // clang-format on
5117 } else {
5118 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5119 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5120
5121 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5122 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5123
5124 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5125 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5126 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5127 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5128
5129 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5130 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5131 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5132 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5133
5134 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5135 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5136 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5137 .add(Src0Sub0)
5138 .add(Src1Sub0);
5139 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5140 .add(Src0Sub1)
5141 .add(Src1Sub1);
5142 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5143 .addReg(DestSub0)
5144 .addImm(AMDGPU::sub0)
5145 .addReg(DestSub1)
5146 .addImm(AMDGPU::sub1);
5147 }
5148 MI.eraseFromParent();
5149 return BB;
5150 }
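// Illustrative sketch (not from the source above): the pre-GFX12 expansion
// splits the 64-bit scalar add/sub into 32-bit halves and carries through SCC
// (s_add_u32 feeding s_addc_u32). A scalar model of the add path, assuming
// <cstdint>; the function name is hypothetical.
static uint64_t addU64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);                     // s_add_u32
  uint32_t Carry = Lo < uint32_t(A);                           // carry-out in SCC
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // s_addc_u32
  return (uint64_t(Hi) << 32) | Lo;               // REG_SEQUENCE of the halves
}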
5151 case AMDGPU::V_ADD_U64_PSEUDO:
5152 case AMDGPU::V_SUB_U64_PSEUDO: {
5154 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5155 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5156 const DebugLoc &DL = MI.getDebugLoc();
5157
5158 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5159
5160 MachineOperand &Dest = MI.getOperand(0);
5161 MachineOperand &Src0 = MI.getOperand(1);
5162 MachineOperand &Src1 = MI.getOperand(2);
5163
5164 if (IsAdd && ST.hasLshlAddB64()) {
5165 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5166 Dest.getReg())
5167 .add(Src0)
5168 .addImm(0)
5169 .add(Src1);
5170 TII->legalizeOperands(*Add);
5171 MI.eraseFromParent();
5172 return BB;
5173 }
5174
5175 const auto *CarryRC = TRI->getWaveMaskRegClass();
5176
5177 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5178 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5179
5180 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5181 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5182
5183 const TargetRegisterClass *Src0RC = Src0.isReg()
5184 ? MRI.getRegClass(Src0.getReg())
5185 : &AMDGPU::VReg_64RegClass;
5186 const TargetRegisterClass *Src1RC = Src1.isReg()
5187 ? MRI.getRegClass(Src1.getReg())
5188 : &AMDGPU::VReg_64RegClass;
5189
5190 const TargetRegisterClass *Src0SubRC =
5191 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5192 const TargetRegisterClass *Src1SubRC =
5193 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5194
5195 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5196 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5197 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5198 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5199
5200 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5201 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5202 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5203 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5204
5205 unsigned LoOpc =
5206 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5207 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5208 .addReg(CarryReg, RegState::Define)
5209 .add(SrcReg0Sub0)
5210 .add(SrcReg1Sub0)
5211 .addImm(0); // clamp bit
5212
5213 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5214 MachineInstr *HiHalf =
5215 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5216 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5217 .add(SrcReg0Sub1)
5218 .add(SrcReg1Sub1)
5219 .addReg(CarryReg, RegState::Kill)
5220 .addImm(0); // clamp bit
5221
5222 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5223 .addReg(DestSub0)
5224 .addImm(AMDGPU::sub0)
5225 .addReg(DestSub1)
5226 .addImm(AMDGPU::sub1);
5227 TII->legalizeOperands(*LoHalf);
5228 TII->legalizeOperands(*HiHalf);
5229 MI.eraseFromParent();
5230 return BB;
5231 }
5232 case AMDGPU::S_ADD_CO_PSEUDO:
5233 case AMDGPU::S_SUB_CO_PSEUDO: {
5234 // This pseudo can only be selected from a uniform
5235 // add/subcarry node. All the VGPR operands are
5236 // therefore assumed to be splat vectors.
5238 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5239 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5241 const DebugLoc &DL = MI.getDebugLoc();
5242 MachineOperand &Dest = MI.getOperand(0);
5243 MachineOperand &CarryDest = MI.getOperand(1);
5244 MachineOperand &Src0 = MI.getOperand(2);
5245 MachineOperand &Src1 = MI.getOperand(3);
5246 MachineOperand &Src2 = MI.getOperand(4);
5247 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5248 ? AMDGPU::S_ADDC_U32
5249 : AMDGPU::S_SUBB_U32;
5250 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5251 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5252 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5253 .addReg(Src0.getReg());
5254 Src0.setReg(RegOp0);
5255 }
5256 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5257 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5258 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5259 .addReg(Src1.getReg());
5260 Src1.setReg(RegOp1);
5261 }
5262 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5263 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5264 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5265 .addReg(Src2.getReg());
5266 Src2.setReg(RegOp2);
5267 }
5268
5269 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5270 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5271 assert(WaveSize == 64 || WaveSize == 32);
5272
5273 if (WaveSize == 64) {
5274 if (ST.hasScalarCompareEq64()) {
5275 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5276 .addReg(Src2.getReg())
5277 .addImm(0);
5278 } else {
5279 const TargetRegisterClass *SubRC =
5280 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5281 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5282 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5283 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5284 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5285 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5286
5287 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5288 .add(Src2Sub0)
5289 .add(Src2Sub1);
5290
5291 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5292 .addReg(Src2_32, RegState::Kill)
5293 .addImm(0);
5294 }
5295 } else {
5296 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5297 .addReg(Src2.getReg())
5298 .addImm(0);
5299 }
5300
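// At this point SCC holds the (possibly reduced) carry-in; S_ADDC_U32 /
// S_SUBB_U32 consume it, and the S_CSELECT below materializes the carry-out
// into CarryDest as an all-ones/zero mask.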
5301 // clang-format off
5302 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5303 .add(Src0)
5304 .add(Src1);
5305 // clang-format on
5306
5307 unsigned SelOpc =
5308 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5309
5310 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5311 .addImm(-1)
5312 .addImm(0);
5313
5314 MI.eraseFromParent();
5315 return BB;
5316 }
5317 case AMDGPU::SI_INIT_M0: {
5318 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5319 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5320 .add(MI.getOperand(0));
5321 MI.eraseFromParent();
5322 return BB;
5323 }
5324 case AMDGPU::GET_GROUPSTATICSIZE: {
5325 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5326 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5327 DebugLoc DL = MI.getDebugLoc();
5328 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5329 .add(MI.getOperand(0))
5330 .addImm(MFI->getLDSSize());
5331 MI.eraseFromParent();
5332 return BB;
5333 }
5334 case AMDGPU::GET_SHADERCYCLESHILO: {
5337 const DebugLoc &DL = MI.getDebugLoc();
5338 // The algorithm is:
5339 //
5340 // hi1 = getreg(SHADER_CYCLES_HI)
5341 // lo1 = getreg(SHADER_CYCLES_LO)
5342 // hi2 = getreg(SHADER_CYCLES_HI)
5343 //
5344 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5345 // Otherwise there was overflow and the result is hi2:0. In both cases the
5346 // result should represent the actual time at some point during the sequence
5347 // of three getregs.
5348 using namespace AMDGPU::Hwreg;
5349 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5350 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5351 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5352 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5353 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5354 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5355 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5356 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5357 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5358 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5359 .addReg(RegHi1)
5360 .addReg(RegHi2);
5361 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5362 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5363 .addReg(RegLo1)
5364 .addImm(0);
5365 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5366 .add(MI.getOperand(0))
5367 .addReg(RegLo)
5368 .addImm(AMDGPU::sub0)
5369 .addReg(RegHi2)
5370 .addImm(AMDGPU::sub1);
5371 MI.eraseFromParent();
5372 return BB;
5373 }
5374 case AMDGPU::SI_INDIRECT_SRC_V1:
5375 case AMDGPU::SI_INDIRECT_SRC_V2:
5376 case AMDGPU::SI_INDIRECT_SRC_V4:
5377 case AMDGPU::SI_INDIRECT_SRC_V8:
5378 case AMDGPU::SI_INDIRECT_SRC_V9:
5379 case AMDGPU::SI_INDIRECT_SRC_V10:
5380 case AMDGPU::SI_INDIRECT_SRC_V11:
5381 case AMDGPU::SI_INDIRECT_SRC_V12:
5382 case AMDGPU::SI_INDIRECT_SRC_V16:
5383 case AMDGPU::SI_INDIRECT_SRC_V32:
5384 return emitIndirectSrc(MI, *BB, *getSubtarget());
5385 case AMDGPU::SI_INDIRECT_DST_V1:
5386 case AMDGPU::SI_INDIRECT_DST_V2:
5387 case AMDGPU::SI_INDIRECT_DST_V4:
5388 case AMDGPU::SI_INDIRECT_DST_V8:
5389 case AMDGPU::SI_INDIRECT_DST_V9:
5390 case AMDGPU::SI_INDIRECT_DST_V10:
5391 case AMDGPU::SI_INDIRECT_DST_V11:
5392 case AMDGPU::SI_INDIRECT_DST_V12:
5393 case AMDGPU::SI_INDIRECT_DST_V16:
5394 case AMDGPU::SI_INDIRECT_DST_V32:
5395 return emitIndirectDst(MI, *BB, *getSubtarget());
5396 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5397 case AMDGPU::SI_KILL_I1_PSEUDO:
5398 return splitKillBlock(MI, BB);
5399 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5400 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5401 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5402 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5403
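// The 64-bit select is split into two 32-bit V_CNDMASK_B32 selects on the
// sub0/sub1 halves, which are then recombined with a REG_SEQUENCE.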
5404 Register Dst = MI.getOperand(0).getReg();
5405 const MachineOperand &Src0 = MI.getOperand(1);
5406 const MachineOperand &Src1 = MI.getOperand(2);
5407 const DebugLoc &DL = MI.getDebugLoc();
5408 Register SrcCond = MI.getOperand(3).getReg();
5409
5410 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5411 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5412 const auto *CondRC = TRI->getWaveMaskRegClass();
5413 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5414
5415 const TargetRegisterClass *Src0RC = Src0.isReg()
5416 ? MRI.getRegClass(Src0.getReg())
5417 : &AMDGPU::VReg_64RegClass;
5418 const TargetRegisterClass *Src1RC = Src1.isReg()
5419 ? MRI.getRegClass(Src1.getReg())
5420 : &AMDGPU::VReg_64RegClass;
5421
5422 const TargetRegisterClass *Src0SubRC =
5423 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5424 const TargetRegisterClass *Src1SubRC =
5425 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5426
5427 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5428 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5429 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5430 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5431
5432 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5433 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5434 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5435 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5436
5437 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5438 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5439 .addImm(0)
5440 .add(Src0Sub0)
5441 .addImm(0)
5442 .add(Src1Sub0)
5443 .addReg(SrcCondCopy);
5444 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5445 .addImm(0)
5446 .add(Src0Sub1)
5447 .addImm(0)
5448 .add(Src1Sub1)
5449 .addReg(SrcCondCopy);
5450
5451 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5452 .addReg(DstLo)
5453 .addImm(AMDGPU::sub0)
5454 .addReg(DstHi)
5455 .addImm(AMDGPU::sub1);
5456 MI.eraseFromParent();
5457 return BB;
5458 }
5459 case AMDGPU::SI_BR_UNDEF: {
5460 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5461 const DebugLoc &DL = MI.getDebugLoc();
5462 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5463 .add(MI.getOperand(0));
5464 Br->getOperand(1).setIsUndef(); // read undef SCC
5465 MI.eraseFromParent();
5466 return BB;
5467 }
5468 case AMDGPU::ADJCALLSTACKUP:
5469 case AMDGPU::ADJCALLSTACKDOWN: {
5470 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5471 MachineInstrBuilder MIB(*MF, &MI);
5472 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5473 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5474 return BB;
5475 }
5476 case AMDGPU::SI_CALL_ISEL: {
5478 const DebugLoc &DL = MI.getDebugLoc();
5479
5480 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5481
5482 MachineInstrBuilder MIB;
5483 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5484
5485 for (const MachineOperand &MO : MI.operands())
5486 MIB.add(MO);
5487
5488 MIB.cloneMemRefs(MI);
5489 MI.eraseFromParent();
5490 return BB;
5491 }
5492 case AMDGPU::V_ADD_CO_U32_e32:
5493 case AMDGPU::V_SUB_CO_U32_e32:
5494 case AMDGPU::V_SUBREV_CO_U32_e32: {
5495 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5496 const DebugLoc &DL = MI.getDebugLoc();
5497 unsigned Opc = MI.getOpcode();
5498
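// If the e32 form has no valid encoding on this subtarget, switch to the
// VOP3 (e64) form, which needs an explicit VCC def and a clamp operand.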
5499 bool NeedClampOperand = false;
5500 if (TII->pseudoToMCOpcode(Opc) == -1) {
5501 Opc = AMDGPU::getVOPe64(Opc);
5502 NeedClampOperand = true;
5503 }
5504
5505 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5506 if (TII->isVOP3(*I)) {
5507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5508 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5509 I.addReg(TRI->getVCC(), RegState::Define);
5510 }
5511 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5512 if (NeedClampOperand)
5513 I.addImm(0); // clamp bit for e64 encoding
5514
5515 TII->legalizeOperands(*I);
5516
5517 MI.eraseFromParent();
5518 return BB;
5519 }
5520 case AMDGPU::V_ADDC_U32_e32:
5521 case AMDGPU::V_SUBB_U32_e32:
5522 case AMDGPU::V_SUBBREV_U32_e32:
5523 // These instructions have an implicit use of vcc which counts towards the
5524 // constant bus limit.
5525 TII->legalizeOperands(MI);
5526 return BB;
5527 case AMDGPU::DS_GWS_INIT:
5528 case AMDGPU::DS_GWS_SEMA_BR:
5529 case AMDGPU::DS_GWS_BARRIER:
5530 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5531 [[fallthrough]];
5532 case AMDGPU::DS_GWS_SEMA_V:
5533 case AMDGPU::DS_GWS_SEMA_P:
5534 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5535 // An s_waitcnt 0 is required to be the instruction immediately following.
5536 if (getSubtarget()->hasGWSAutoReplay()) {
5537 bundleInstWithWaitcnt(MI);
5538 return BB;
5539 }
5540
5541 return emitGWSMemViolTestLoop(MI, BB);
5542 case AMDGPU::S_SETREG_B32: {
5543 // Try to optimize cases that only set the denormal mode or rounding mode.
5544 //
5545 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5546 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5547 // instead.
5548 //
5549 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5550 // allow you to have a no-side-effect instruction in the output of a
5551 // side-effecting pattern.
5552 auto [ID, Offset, Width] =
5553 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5554 if (ID != AMDGPU::Hwreg::ID_MODE)
5555 return BB;
5556
5557 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5558 const unsigned SetMask = WidthMask << Offset;
5559
5560 if (getSubtarget()->hasDenormModeInst()) {
5561 unsigned SetDenormOp = 0;
5562 unsigned SetRoundOp = 0;
5563
5564 // The dedicated instructions can only set the whole denorm or round mode
5565 // at once, not a subset of bits in either.
5566 if (SetMask ==
5567 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5568 // If this fully sets both the round and denorm mode, emit the two
5569 // dedicated instructions for these.
5570 SetRoundOp = AMDGPU::S_ROUND_MODE;
5571 SetDenormOp = AMDGPU::S_DENORM_MODE;
5572 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5573 SetRoundOp = AMDGPU::S_ROUND_MODE;
5574 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5575 SetDenormOp = AMDGPU::S_DENORM_MODE;
5576 }
5577
5578 if (SetRoundOp || SetDenormOp) {
5579 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5580 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5581 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5582 unsigned ImmVal = Def->getOperand(1).getImm();
5583 if (SetRoundOp) {
5584 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5585 .addImm(ImmVal & 0xf);
5586
5587 // If we also have the denorm mode, get just the denorm mode bits.
5588 ImmVal >>= 4;
5589 }
5590
5591 if (SetDenormOp) {
5592 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5593 .addImm(ImmVal & 0xf);
5594 }
5595
5596 MI.eraseFromParent();
5597 return BB;
5598 }
5599 }
5600 }
5601
5602 // If only FP bits are touched, use the no-side-effects pseudo.
5603 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5604 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5605 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5606
5607 return BB;
5608 }
5609 case AMDGPU::S_INVERSE_BALLOT_U32:
5610 case AMDGPU::S_INVERSE_BALLOT_U64:
5611 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5612 // necessary. After that they are equivalent to a COPY.
5613 MI.setDesc(TII->get(AMDGPU::COPY));
5614 return BB;
5615 case AMDGPU::ENDPGM_TRAP: {
5616 const DebugLoc &DL = MI.getDebugLoc();
5617 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5618 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5619 MI.addOperand(MachineOperand::CreateImm(0));
5620 return BB;
5621 }
5622
5623 // We need a block split to make the real endpgm a terminator. We also don't
5624 // want to break phis in successor blocks, so we can't just delete to the
5625 // end of the block.
5626
5627 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5628 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5629 MF->push_back(TrapBB);
5630 // clang-format off
5631 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5632 .addImm(0);
5633 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5634 .addMBB(TrapBB);
5635 // clang-format on
5636
5637 BB->addSuccessor(TrapBB);
5638 MI.eraseFromParent();
5639 return SplitBB;
5640 }
5641 case AMDGPU::SIMULATED_TRAP: {
5642 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5643 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5644 MachineBasicBlock *SplitBB =
5645 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5646 MI.eraseFromParent();
5647 return SplitBB;
5648 }
5649 default:
5650 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5651 if (!MI.mayStore())
5652 AddMemOpInit(MI);
5653 return BB;
5654 }
5655 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5656 }
5657}
5658
5659bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5660 // This currently forces unfolding various combinations of fsub into fma with
5661 // free fneg'd operands. As long as we have fast FMA (controlled by
5662 // isFMAFasterThanFMulAndFAdd), we should perform these.
5663
5664 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5665 // most of these combines appear to be cycle neutral but save on instruction
5666 // count / code size.
5667 return true;
5668}
5669
5670bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5671
5672EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5673 EVT VT) const {
5674 if (!VT.isVector()) {
5675 return MVT::i1;
5676 }
5677 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5678}
5679
5680MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5681 // TODO: Should i16 be used always if legal? For now it would force VALU
5682 // shifts.
5683 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5684}
5685
5686LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5687 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5688 ? Ty.changeElementSize(16)
5689 : Ty.changeElementSize(32);
5690}
5691
5692// Answering this is somewhat tricky and depends on the specific device, since
5693// devices have different rates for fma and for f64 operations.
5694//
5695// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5696// regardless of which device (although the number of cycles differs between
5697// devices), so it is always profitable for f64.
5698//
5699// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5700// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5701// which we can always do even without fused FP ops since it returns the same
5702// result as the separate operations and since it is always full
5703// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5704// however does not support denormals, so we do report fma as faster if we have
5705// a fast fma device and require denormals.
5706//
5707bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5708 EVT VT) const {
5709 VT = VT.getScalarType();
5710
5711 switch (VT.getSimpleVT().SimpleTy) {
5712 case MVT::f32: {
5713 // If mad is not available this depends only on if f32 fma is full rate.
5714 if (!Subtarget->hasMadMacF32Insts())
5715 return Subtarget->hasFastFMAF32();
5716
5717 // Otherwise f32 mad is always full rate and returns the same result as
5718 // the separate operations so should be preferred over fma.
5719 // However, it does not support denormals.
5720 if (!denormalModeIsFlushAllF32(MF))
5721 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5722
5723 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5724 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5725 }
5726 case MVT::f64:
5727 return true;
5728 case MVT::f16:
5729 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5730 default:
5731 break;
5732 }
5733
5734 return false;
5735}
5736
5737bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5738 LLT Ty) const {
5739 switch (Ty.getScalarSizeInBits()) {
5740 case 16:
5741 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5742 case 32:
5743 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5744 case 64:
5745 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5746 default:
5747 break;
5748 }
5749
5750 return false;
5751}
5752
5753// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5754// specific details.
5755bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
5756 Type *Ty) const {
5757 switch (Ty->getScalarSizeInBits()) {
5758 case 16: {
5759 SIModeRegisterDefaults Mode(F, *Subtarget);
5760 return Subtarget->has16BitInsts() &&
5761 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5762 }
5763 case 32: {
5764 if (!Subtarget->hasMadMacF32Insts())
5765 return Subtarget->hasFastFMAF32();
5766
5767 SIModeRegisterDefaults Mode(F, *Subtarget);
5768 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5769 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5770
5771 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5772 }
5773 case 64:
5774 return true;
5775 default:
5776 break;
5777 }
5778
5779 return false;
5780}
5781
5782bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
5783 if (!Ty.isScalar())
5784 return false;
5785
5786 if (Ty.getScalarSizeInBits() == 16)
5787 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5788 if (Ty.getScalarSizeInBits() == 32)
5789 return Subtarget->hasMadMacF32Insts() &&
5790 denormalModeIsFlushAllF32(*MI.getMF());
5791
5792 return false;
5793}
5794
5795bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5796 const SDNode *N) const {
5797 // TODO: Check future ftz flag
5798 // v_mad_f32/v_mac_f32 do not support denormals.
5799 EVT VT = N->getValueType(0);
5800 if (VT == MVT::f32)
5801 return Subtarget->hasMadMacF32Insts() &&
5802 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5803 if (VT == MVT::f16) {
5804 return Subtarget->hasMadF16() &&
5805 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5806 }
5807
5808 return false;
5809}
5810
5811//===----------------------------------------------------------------------===//
5812// Custom DAG Lowering Operations
5813//===----------------------------------------------------------------------===//
5814
5815// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5816// wider vector type is legal.
5817SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5818 SelectionDAG &DAG) const {
5819 unsigned Opc = Op.getOpcode();
5820 EVT VT = Op.getValueType();
5821 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5822 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5823 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5824 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5825
5826 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5827
5828 SDLoc SL(Op);
5829 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5830 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5831
5832 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5833}
5834
5835// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5836// wider vector type is legal.
5837SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5838 SelectionDAG &DAG) const {
5839 unsigned Opc = Op.getOpcode();
5840 EVT VT = Op.getValueType();
5841 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5842 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5843 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5844 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5845
5846 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5847 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5848
5849 SDLoc SL(Op);
5850
5851 SDValue OpLo =
5852 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5853 SDValue OpHi =
5854 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5855
5856 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5857}
5858
5859SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5860 SelectionDAG &DAG) const {
5861 unsigned Opc = Op.getOpcode();
5862 EVT VT = Op.getValueType();
5863 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5864 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5865 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5866 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5867 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5868 VT == MVT::v32bf16);
5869
5870 SDValue Op0 = Op.getOperand(0);
5871 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5872 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5873 : std::pair(Op0, Op0);
5874
5875 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5876 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5877
5878 SDLoc SL(Op);
5879 auto ResVT = DAG.GetSplitDestVTs(VT);
5880
5881 SDValue OpLo =
5882 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5883 SDValue OpHi =
5884 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5885
5886 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5887}
5888
5889SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5890 switch (Op.getOpcode()) {
5891 default:
5892 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5893 case ISD::BRCOND:
5894 return LowerBRCOND(Op, DAG);
5895 case ISD::RETURNADDR:
5896 return LowerRETURNADDR(Op, DAG);
5897 case ISD::LOAD: {
5898 SDValue Result = LowerLOAD(Op, DAG);
5899 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5900 "Load should return a value and a chain");
5901 return Result;
5902 }
5903 case ISD::FSQRT: {
5904 EVT VT = Op.getValueType();
5905 if (VT == MVT::f32)
5906 return lowerFSQRTF32(Op, DAG);
5907 if (VT == MVT::f64)
5908 return lowerFSQRTF64(Op, DAG);
5909 return SDValue();
5910 }
5911 case ISD::FSIN:
5912 case ISD::FCOS:
5913 return LowerTrig(Op, DAG);
5914 case ISD::SELECT:
5915 return LowerSELECT(Op, DAG);
5916 case ISD::FDIV:
5917 return LowerFDIV(Op, DAG);
5918 case ISD::FFREXP:
5919 return LowerFFREXP(Op, DAG);
5920 case ISD::ATOMIC_CMP_SWAP:
5921 return LowerATOMIC_CMP_SWAP(Op, DAG);
5922 case ISD::STORE:
5923 return LowerSTORE(Op, DAG);
5924 case ISD::GlobalAddress: {
5925 MachineFunction &MF = DAG.getMachineFunction();
5926 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5927 return LowerGlobalAddress(MFI, Op, DAG);
5928 }
5929 case ISD::INTRINSIC_WO_CHAIN:
5930 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5931 case ISD::INTRINSIC_W_CHAIN:
5932 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5933 case ISD::INTRINSIC_VOID:
5934 return LowerINTRINSIC_VOID(Op, DAG);
5935 case ISD::ADDRSPACECAST:
5936 return lowerADDRSPACECAST(Op, DAG);
5937 case ISD::INSERT_SUBVECTOR:
5938 return lowerINSERT_SUBVECTOR(Op, DAG);
5939 case ISD::INSERT_VECTOR_ELT:
5940 return lowerINSERT_VECTOR_ELT(Op, DAG);
5941 case ISD::EXTRACT_VECTOR_ELT:
5942 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5943 case ISD::VECTOR_SHUFFLE:
5944 return lowerVECTOR_SHUFFLE(Op, DAG);
5945 case ISD::SCALAR_TO_VECTOR:
5946 return lowerSCALAR_TO_VECTOR(Op, DAG);
5947 case ISD::BUILD_VECTOR:
5948 return lowerBUILD_VECTOR(Op, DAG);
5949 case ISD::FP_ROUND:
5950 case ISD::STRICT_FP_ROUND:
5951 return lowerFP_ROUND(Op, DAG);
5952 case ISD::TRAP:
5953 return lowerTRAP(Op, DAG);
5954 case ISD::DEBUGTRAP:
5955 return lowerDEBUGTRAP(Op, DAG);
5956 case ISD::ABS:
5957 case ISD::FABS:
5958 case ISD::FNEG:
5959 case ISD::FCANONICALIZE:
5960 case ISD::BSWAP:
5961 return splitUnaryVectorOp(Op, DAG);
5962 case ISD::FMINNUM:
5963 case ISD::FMAXNUM:
5964 return lowerFMINNUM_FMAXNUM(Op, DAG);
5965 case ISD::FLDEXP:
5966 case ISD::STRICT_FLDEXP:
5967 return lowerFLDEXP(Op, DAG);
5968 case ISD::FMA:
5969 return splitTernaryVectorOp(Op, DAG);
5970 case ISD::FP_TO_SINT:
5971 case ISD::FP_TO_UINT:
5972 return LowerFP_TO_INT(Op, DAG);
5973 case ISD::SHL:
5974 case ISD::SRA:
5975 case ISD::SRL:
5976 case ISD::ADD:
5977 case ISD::SUB:
5978 case ISD::SMIN:
5979 case ISD::SMAX:
5980 case ISD::UMIN:
5981 case ISD::UMAX:
5982 case ISD::FADD:
5983 case ISD::FMUL:
5984 case ISD::FMINNUM_IEEE:
5985 case ISD::FMAXNUM_IEEE:
5986 case ISD::FMINIMUM:
5987 case ISD::FMAXIMUM:
5988 case ISD::FMINIMUMNUM:
5989 case ISD::FMAXIMUMNUM:
5990 case ISD::UADDSAT:
5991 case ISD::USUBSAT:
5992 case ISD::SADDSAT:
5993 case ISD::SSUBSAT:
5994 return splitBinaryVectorOp(Op, DAG);
5995 case ISD::MUL:
5996 return lowerMUL(Op, DAG);
5997 case ISD::SMULO:
5998 case ISD::UMULO:
5999 return lowerXMULO(Op, DAG);
6000 case ISD::SMUL_LOHI:
6001 case ISD::UMUL_LOHI:
6002 return lowerXMUL_LOHI(Op, DAG);
6003 case ISD::DYNAMIC_STACKALLOC:
6004 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6005 case ISD::STACKSAVE:
6006 return LowerSTACKSAVE(Op, DAG);
6007 case ISD::GET_ROUNDING:
6008 return lowerGET_ROUNDING(Op, DAG);
6009 case ISD::SET_ROUNDING:
6010 return lowerSET_ROUNDING(Op, DAG);
6011 case ISD::PREFETCH:
6012 return lowerPREFETCH(Op, DAG);
6013 case ISD::FP_EXTEND:
6014 case ISD::STRICT_FP_EXTEND:
6015 return lowerFP_EXTEND(Op, DAG);
6016 case ISD::GET_FPENV:
6017 return lowerGET_FPENV(Op, DAG);
6018 case ISD::SET_FPENV:
6019 return lowerSET_FPENV(Op, DAG);
6020 }
6021 return SDValue();
6022}
6023
6024// Used for D16: Casts the result of an instruction into the right vector,
6025// packs values if loads return unpacked values.
6026static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6027 const SDLoc &DL, SelectionDAG &DAG,
6028 bool Unpacked) {
6029 if (!LoadVT.isVector())
6030 return Result;
6031
6032 // Cast back to the original packed type or to a larger type that is a
6033 // multiple of 32 bits for D16. Widening the return type is required for
6034 // legalization.
6035 EVT FittingLoadVT = LoadVT;
6036 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6037 FittingLoadVT =
6038 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6039 LoadVT.getVectorNumElements() + 1);
6040 }
6041
6042 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6043 // Truncate to v2i16/v4i16.
6044 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6045
6046 // Work around the legalizer not scalarizing the truncate after vector op
6047 // legalization and not creating an intermediate vector trunc.
6048 SmallVector<SDValue, 4> Elts;
6049 DAG.ExtractVectorElements(Result, Elts);
6050 for (SDValue &Elt : Elts)
6051 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6052
6053 // Pad illegal v1i16/v3f16 to v4i16
6054 if ((LoadVT.getVectorNumElements() % 2) == 1)
6055 Elts.push_back(DAG.getUNDEF(MVT::i16));
6056
6057 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6058
6059 // Bitcast to original type (v2f16/v4f16).
6060 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6061 }
6062
6063 // Cast back to the original packed type.
6064 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6065}
6066
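// Wrap a D16 memory intrinsic: emit the load with a legal equivalent type and
// convert the result back to the requested packed type using
// adjustLoadValueTypeImpl above.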
6067SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6068 SelectionDAG &DAG,
6069 ArrayRef<SDValue> Ops,
6070 bool IsIntrinsic) const {
6071 SDLoc DL(M);
6072
6073 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6074 EVT LoadVT = M->getValueType(0);
6075
6076 EVT EquivLoadVT = LoadVT;
6077 if (LoadVT.isVector()) {
6078 if (Unpacked) {
6079 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6080 LoadVT.getVectorNumElements());
6081 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6082 // Widen v3f16 to legal type
6083 EquivLoadVT =
6084 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6085 LoadVT.getVectorNumElements() + 1);
6086 }
6087 }
6088
6089 // Change from v4f16/v2f16 to EquivLoadVT.
6090 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6091
6092 SDValue Load = DAG.getMemIntrinsicNode(
6093 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6094 M->getMemoryVT(), M->getMemOperand());
6095
6096 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6097
6098 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6099}
6100
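// Common lowering for buffer-load intrinsics: handles d16 results, sub-dword
// (byte/short) loads, TFE, and result types that are not directly legal.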
6101SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6102 SelectionDAG &DAG,
6103 ArrayRef<SDValue> Ops) const {
6104 SDLoc DL(M);
6105 EVT LoadVT = M->getValueType(0);
6106 EVT EltType = LoadVT.getScalarType();
6107 EVT IntVT = LoadVT.changeTypeToInteger();
6108
6109 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6110
6111 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6112 bool IsTFE = M->getNumValues() == 3;
6113
6114 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6115 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6116 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6117 : AMDGPUISD::BUFFER_LOAD;
6118
6119 if (IsD16) {
6120 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6121 }
6122
6123 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6124 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6125 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6126 IsTFE);
6127
6128 if (isTypeLegal(LoadVT)) {
6129 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6130 M->getMemOperand(), DAG);
6131 }
6132
6133 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6134 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6135 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6136 M->getMemOperand(), DAG);
6137 return DAG.getMergeValues(
6138 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6139 DL);
6140}
6141
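// Lower llvm.amdgcn.icmp to AMDGPUISD::SETCC producing a wave-sized lane
// mask; illegal i16 operands are promoted to i32 first.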
6142static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6143 SelectionDAG &DAG) {
6144 EVT VT = N->getValueType(0);
6145 unsigned CondCode = N->getConstantOperandVal(3);
6146 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6147 return DAG.getUNDEF(VT);
6148
6149 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6150
6151 SDValue LHS = N->getOperand(1);
6152 SDValue RHS = N->getOperand(2);
6153
6154 SDLoc DL(N);
6155
6156 EVT CmpVT = LHS.getValueType();
6157 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6158 unsigned PromoteOp =
6159 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6160 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6161 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6162 }
6163
6164 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6165
6166 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6167 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6168
6169 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6170 DAG.getCondCode(CCOpcode));
6171 if (VT.bitsEq(CCVT))
6172 return SetCC;
6173 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6174}
6175
6176static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6177 SelectionDAG &DAG) {
6178 EVT VT = N->getValueType(0);
6179
6180 unsigned CondCode = N->getConstantOperandVal(3);
6181 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6182 return DAG.getUNDEF(VT);
6183
6184 SDValue Src0 = N->getOperand(1);
6185 SDValue Src1 = N->getOperand(2);
6186 EVT CmpVT = Src0.getValueType();
6187 SDLoc SL(N);
6188
6189 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6190 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6191 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6192 }
6193
6194 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6195 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6196 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6197 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6198 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6199 DAG.getCondCode(CCOpcode));
6200 if (VT.bitsEq(CCVT))
6201 return SetCC;
6202 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6203}
6204
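// Lower llvm.amdgcn.ballot: fold ballot(0)/ballot(1) to 0/EXEC, forward an
// ISD::SETCC operand directly, and otherwise compare the zero-extended source
// against zero.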
6205static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6206 SelectionDAG &DAG) {
6207 EVT VT = N->getValueType(0);
6208 SDValue Src = N->getOperand(1);
6209 SDLoc SL(N);
6210
6211 if (Src.getOpcode() == ISD::SETCC) {
6212 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6213 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6214 Src.getOperand(1), Src.getOperand(2));
6215 }
6216 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6217 // (ballot 0) -> 0
6218 if (Arg->isZero())
6219 return DAG.getConstant(0, SL, VT);
6220
6221 // (ballot 1) -> EXEC/EXEC_LO
6222 if (Arg->isOne()) {
6223 Register Exec;
6224 if (VT.getScalarSizeInBits() == 32)
6225 Exec = AMDGPU::EXEC_LO;
6226 else if (VT.getScalarSizeInBits() == 64)
6227 Exec = AMDGPU::EXEC;
6228 else
6229 return SDValue();
6230
6231 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6232 }
6233 }
6234
6235 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6236 // ISD::SETNE)
6237 return DAG.getNode(
6238 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6239 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6240}
6241
6242static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6243 SelectionDAG &DAG) {
6244 EVT VT = N->getValueType(0);
6245 unsigned ValSize = VT.getSizeInBits();
6246 unsigned IID = N->getConstantOperandVal(0);
6247 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6248 IID == Intrinsic::amdgcn_permlanex16;
6249 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6250 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6251 SDLoc SL(N);
6252 MVT IntVT = MVT::getIntegerVT(ValSize);
6253 const GCNSubtarget *ST = TLI.getSubtarget();
6254 unsigned SplitSize = 32;
6255 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6256 ST->hasDPALU_DPP() &&
6257 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6258 SplitSize = 64;
6259
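// createLaneOp rebuilds the lane intrinsic over (possibly split or extended)
// sources; operands are collected in reverse and flipped so the intrinsic ID
// ends up first, and any convergencectrl glue is re-attached.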
6260 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6261 SDValue Src2, MVT ValT) -> SDValue {
6262 SmallVector<SDValue, 6> Operands;
6263 switch (IID) {
6264 case Intrinsic::amdgcn_permlane16:
6265 case Intrinsic::amdgcn_permlanex16:
6266 case Intrinsic::amdgcn_update_dpp:
6267 Operands.push_back(N->getOperand(6));
6268 Operands.push_back(N->getOperand(5));
6269 Operands.push_back(N->getOperand(4));
6270 [[fallthrough]];
6271 case Intrinsic::amdgcn_writelane:
6272 Operands.push_back(Src2);
6273 [[fallthrough]];
6274 case Intrinsic::amdgcn_readlane:
6275 case Intrinsic::amdgcn_set_inactive:
6276 case Intrinsic::amdgcn_set_inactive_chain_arg:
6277 case Intrinsic::amdgcn_mov_dpp8:
6278 Operands.push_back(Src1);
6279 [[fallthrough]];
6280 case Intrinsic::amdgcn_readfirstlane:
6281 case Intrinsic::amdgcn_permlane64:
6282 Operands.push_back(Src0);
6283 break;
6284 default:
6285 llvm_unreachable("unhandled lane op");
6286 }
6287
6288 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6289 std::reverse(Operands.begin(), Operands.end());
6290
6291 if (SDNode *GL = N->getGluedNode()) {
6292 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6293 GL = GL->getOperand(0).getNode();
6294 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6295 SDValue(GL, 0)));
6296 }
6297
6298 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6299 };
6300
6301 SDValue Src0 = N->getOperand(1);
6302 SDValue Src1, Src2;
6303 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6304 IID == Intrinsic::amdgcn_mov_dpp8 ||
6305 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6306 Src1 = N->getOperand(2);
6307 if (IID == Intrinsic::amdgcn_writelane ||
6308 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6309 Src2 = N->getOperand(3);
6310 }
6311
6312 if (ValSize == SplitSize) {
6313 // Already legal
6314 return SDValue();
6315 }
6316
6317 if (ValSize < 32) {
6318 bool IsFloat = VT.isFloatingPoint();
6319 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6320 SL, MVT::i32);
6321
6322 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6323 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6324 SL, MVT::i32);
6325 }
6326
6327 if (IID == Intrinsic::amdgcn_writelane) {
6328 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6329 SL, MVT::i32);
6330 }
6331
6332 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6333 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6334 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6335 }
6336
6337 if (ValSize % SplitSize != 0)
6338 return SDValue();
6339
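// unrollLaneOp scalarizes a vector-typed lane op element by element,
// passing scalar operands through unchanged and preserving the optional
// convergencectrl glue.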
6340 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6341 EVT VT = N->getValueType(0);
6342 unsigned NE = VT.getVectorNumElements();
6343 EVT EltVT = VT.getVectorElementType();
6344 SmallVector<SDValue, 8> Scalars;
6345 unsigned NumOperands = N->getNumOperands();
6346 SmallVector<SDValue, 4> Operands(NumOperands);
6347 SDNode *GL = N->getGluedNode();
6348
6349 // only handle convergencectrl_glue
6350 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6351
6352 for (unsigned i = 0; i != NE; ++i) {
6353 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6354 ++j) {
6355 SDValue Operand = N->getOperand(j);
6356 EVT OperandVT = Operand.getValueType();
6357 if (OperandVT.isVector()) {
6358 // A vector operand; extract a single element.
6359 EVT OperandEltVT = OperandVT.getVectorElementType();
6360 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6361 Operand, DAG.getVectorIdxConstant(i, SL));
6362 } else {
6363 // A scalar operand; just use it as is.
6364 Operands[j] = Operand;
6365 }
6366 }
6367
6368 if (GL)
6369 Operands[NumOperands - 1] =
6370 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6371 SDValue(GL->getOperand(0).getNode(), 0));
6372
6373 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6374 }
6375
6376 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6377 return DAG.getBuildVector(VecVT, SL, Scalars);
6378 };
6379
6380 if (VT.isVector()) {
6381 switch (MVT::SimpleValueType EltTy =
6382 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6383 case MVT::i32:
6384 case MVT::f32:
6385 if (SplitSize == 32) {
6386 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6387 return unrollLaneOp(LaneOp.getNode());
6388 }
6389 [[fallthrough]];
6390 case MVT::i16:
6391 case MVT::f16:
6392 case MVT::bf16: {
6393 unsigned SubVecNumElt =
6394 SplitSize / VT.getVectorElementType().getSizeInBits();
6395 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6396 SmallVector<SDValue, 4> Pieces;
6397 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6398 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6399 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6400 DAG.getConstant(EltIdx, SL, MVT::i32));
6401
6402 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6403 IsPermLane16)
6404 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6405 DAG.getConstant(EltIdx, SL, MVT::i32));
6406
6407 if (IID == Intrinsic::amdgcn_writelane)
6408 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6409 DAG.getConstant(EltIdx, SL, MVT::i32));
6410
6411 Pieces.push_back(
6412 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6413 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6414 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6415 EltIdx += SubVecNumElt;
6416 }
6417 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6418 }
6419 default:
6420 // Handle all other cases by bitcasting to i32 vectors
6421 break;
6422 }
6423 }
6424
6425 MVT VecVT =
6426 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6427 Src0 = DAG.getBitcast(VecVT, Src0);
6428
6429 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6430 Src1 = DAG.getBitcast(VecVT, Src1);
6431
6432 if (IID == Intrinsic::amdgcn_writelane)
6433 Src2 = DAG.getBitcast(VecVT, Src2);
6434
6435 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6436 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6437 return DAG.getBitcast(VT, UnrolledLaneOp);
6438}
6439
6440void SITargetLowering::ReplaceNodeResults(SDNode *N,
6441 SmallVectorImpl<SDValue> &Results,
6442 SelectionDAG &DAG) const {
6443 switch (N->getOpcode()) {
6444 case ISD::INSERT_VECTOR_ELT: {
6445 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6446 Results.push_back(Res);
6447 return;
6448 }
6449 case ISD::EXTRACT_VECTOR_ELT: {
6450 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6451 Results.push_back(Res);
6452 return;
6453 }
6454 case ISD::INTRINSIC_WO_CHAIN: {
6455 unsigned IID = N->getConstantOperandVal(0);
6456 switch (IID) {
6457 case Intrinsic::amdgcn_make_buffer_rsrc:
6458 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6459 return;
6460 case Intrinsic::amdgcn_cvt_pkrtz: {
6461 SDValue Src0 = N->getOperand(1);
6462 SDValue Src1 = N->getOperand(2);
6463 SDLoc SL(N);
6464 SDValue Cvt =
6465 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6466 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6467 return;
6468 }
6469 case Intrinsic::amdgcn_cvt_pknorm_i16:
6470 case Intrinsic::amdgcn_cvt_pknorm_u16:
6471 case Intrinsic::amdgcn_cvt_pk_i16:
6472 case Intrinsic::amdgcn_cvt_pk_u16: {
6473 SDValue Src0 = N->getOperand(1);
6474 SDValue Src1 = N->getOperand(2);
6475 SDLoc SL(N);
6476 unsigned Opcode;
6477
6478 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6479 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6480 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6481 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6482 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6483 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6484 else
6485 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6486
6487 EVT VT = N->getValueType(0);
6488 if (isTypeLegal(VT))
6489 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6490 else {
6491 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6492 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6493 }
6494 return;
6495 }
6496 case Intrinsic::amdgcn_s_buffer_load: {
6497 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6498 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
6499 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6500 // (performSignExtendInRegCombine()), which replaces s_buffer_load_u8 with
6501 // s_buffer_load_i8.
6502 if (!Subtarget->hasScalarSubwordLoads())
6503 return;
6504 SDValue Op = SDValue(N, 0);
6505 SDValue Rsrc = Op.getOperand(1);
6506 SDValue Offset = Op.getOperand(2);
6507 SDValue CachePolicy = Op.getOperand(3);
6508 EVT VT = Op.getValueType();
6509 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6510 SDLoc DL(Op);
6511 MachineFunction &MF = DAG.getMachineFunction();
6512 const DataLayout &DataLayout = DAG.getDataLayout();
6513 Align Alignment =
6514 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6515 MachineMemOperand *MMO = MF.getMachineMemOperand(
6516 MachinePointerInfo(),
6517 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6518 MachineMemOperand::MOInvariant,
6519 VT.getStoreSize(), Alignment);
6520 SDValue LoadVal;
6521 if (!Offset->isDivergent()) {
6522 SDValue Ops[] = {Rsrc, // source register
6523 Offset, CachePolicy};
6524 SDValue BufferLoad =
6525 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6526 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6527 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6528 } else {
6529 SDValue Ops[] = {
6530 DAG.getEntryNode(), // Chain
6531 Rsrc, // rsrc
6532 DAG.getConstant(0, DL, MVT::i32), // vindex
6533 {}, // voffset
6534 {}, // soffset
6535 {}, // offset
6536 CachePolicy, // cachepolicy
6537 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6538 };
6539 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6540 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6541 }
6542 Results.push_back(LoadVal);
6543 return;
6544 }
6545 }
6546 break;
6547 }
6548 case ISD::INTRINSIC_W_CHAIN: {
6549 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6550 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6551 // FIXME: Hacky
6552 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6553 Results.push_back(Res.getOperand(I));
6554 }
6555 } else {
6556 Results.push_back(Res);
6557 Results.push_back(Res.getValue(1));
6558 }
6559 return;
6560 }
6561
6562 break;
6563 }
6564 case ISD::SELECT: {
6565 SDLoc SL(N);
6566 EVT VT = N->getValueType(0);
6567 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6568 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6569 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6570
6571 EVT SelectVT = NewVT;
6572 if (NewVT.bitsLT(MVT::i32)) {
6573 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6574 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6575 SelectVT = MVT::i32;
6576 }
6577
6578 SDValue NewSelect =
6579 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6580
6581 if (NewVT != SelectVT)
6582 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6583 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6584 return;
6585 }
6586 case ISD::FNEG: {
6587 if (N->getValueType(0) != MVT::v2f16)
6588 break;
6589
6590 SDLoc SL(N);
6591 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6592
6593 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6594 DAG.getConstant(0x80008000, SL, MVT::i32));
6595 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6596 return;
6597 }
6598 case ISD::FABS: {
6599 if (N->getValueType(0) != MVT::v2f16)
6600 break;
6601
6602 SDLoc SL(N);
6603 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6604
6605 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6606 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6607 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6608 return;
6609 }
6610 case ISD::FSQRT: {
6611 if (N->getValueType(0) != MVT::f16)
6612 break;
6613 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6614 break;
6615 }
6616 default:
6617 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6618 break;
6619 }
6620}
6621
6622/// Helper function for LowerBRCOND
6623static SDNode *findUser(SDValue Value, unsigned Opcode) {
6624
6625 for (SDUse &U : Value->uses()) {
6626 if (U.get() != Value)
6627 continue;
6628
6629 if (U.getUser()->getOpcode() == Opcode)
6630 return U.getUser();
6631 }
6632 return nullptr;
6633}
6634
6635unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6636 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6637 switch (Intr->getConstantOperandVal(1)) {
6638 case Intrinsic::amdgcn_if:
6639 return AMDGPUISD::IF;
6640 case Intrinsic::amdgcn_else:
6641 return AMDGPUISD::ELSE;
6642 case Intrinsic::amdgcn_loop:
6643 return AMDGPUISD::LOOP;
6644 case Intrinsic::amdgcn_end_cf:
6645 llvm_unreachable("should not occur");
6646 default:
6647 return 0;
6648 }
6649 }
6650
6651 // break, if_break, else_break are all only used as inputs to loop, not
6652 // directly as branch conditions.
6653 return 0;
6654}
6655
6656bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6657 const Triple &TT = getTargetMachine().getTargetTriple();
6661}
6662
6663bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6664 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6665 return false;
6666
6667 // FIXME: Either avoid relying on address space here or change the default
6668 // address space for functions to avoid the explicit check.
6669 return (GV->getValueType()->isFunctionTy() ||
6672}
6673
6674bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6675 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6676}
6677
6678bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6679 if (!GV->hasExternalLinkage())
6680 return true;
6681
6682 const auto OS = getTargetMachine().getTargetTriple().getOS();
6683 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6684}
6685
6686/// This transforms the control flow intrinsics to get the branch destination
6687/// as the last parameter; it also switches the branch target with BR if needed.
6688SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6689 SDLoc DL(BRCOND);
6690
6691 SDNode *Intr = BRCOND.getOperand(1).getNode();
6692 SDValue Target = BRCOND.getOperand(2);
6693 SDNode *BR = nullptr;
6694 SDNode *SetCC = nullptr;
6695
6696 if (Intr->getOpcode() == ISD::SETCC) {
6697 // As long as we negate the condition everything is fine
6698 SetCC = Intr;
6699 Intr = SetCC->getOperand(0).getNode();
6700
6701 } else {
6702 // Get the target from BR if we don't negate the condition
6703 BR = findUser(BRCOND, ISD::BR);
6704 assert(BR && "brcond missing unconditional branch user");
6705 Target = BR->getOperand(1);
6706 }
6707
6708 unsigned CFNode = isCFIntrinsic(Intr);
6709 if (CFNode == 0) {
6710 // This is a uniform branch so we don't need to legalize.
6711 return BRCOND;
6712 }
6713
6714 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6715 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6716
6717 assert(!SetCC ||
6718 (SetCC->getConstantOperandVal(1) == 1 &&
6719 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6720 ISD::SETNE));
6721
6722 // operands of the new intrinsic call
6723 SmallVector<SDValue, 4> Ops;
6724 if (HaveChain)
6725 Ops.push_back(BRCOND.getOperand(0));
6726
6727 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6728 Ops.push_back(Target);
6729
6730 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6731
6732 // build the new intrinsic call
6733 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6734
6735 if (!HaveChain) {
6736 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6737
6738 Result = DAG.getMergeValues(Ops, DL).getNode();
6739 }
6740
6741 if (BR) {
6742 // Give the branch instruction our target
6743 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6744 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6745 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6746 }
6747
6748 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6749
6750 // Copy the intrinsic results to registers
6751 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6752 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6753 if (!CopyToReg)
6754 continue;
6755
6756 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6757 SDValue(Result, i - 1), SDValue());
6758
6759 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6760 }
6761
6762 // Remove the old intrinsic from the chain
6763 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6764 Intr->getOperand(0));
6765
6766 return Chain;
6767}
6768
6769SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6770 MVT VT = Op.getSimpleValueType();
6771 SDLoc DL(Op);
6772 // Checking the depth
6773 if (Op.getConstantOperandVal(0) != 0)
6774 return DAG.getConstant(0, DL, VT);
6775
6776 MachineFunction &MF = DAG.getMachineFunction();
6777 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6778 // Check for kernel and shader functions
6779 if (Info->isEntryFunction())
6780 return DAG.getConstant(0, DL, VT);
6781
6782 MachineFrameInfo &MFI = MF.getFrameInfo();
6783 // There is a call to @llvm.returnaddress in this function
6784 MFI.setReturnAddressIsTaken(true);
6785
6786 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6787 // Get the return address reg and mark it as an implicit live-in
6788 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6789 getRegClassFor(VT, Op.getNode()->isDivergent()));
6790
6791 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6792}
6793
6794SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6795 const SDLoc &DL, EVT VT) const {
6796 return Op.getValueType().bitsLE(VT)
6797 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6798 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6799 DAG.getTargetConstant(0, DL, MVT::i32));
6800}
6801
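// f64 -> f16 is lowered through FP_TO_FP16 (an i32 result), then truncated to
// i16 and bitcast to f16; other source types are left to default handling.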
6802SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6803 assert(Op.getValueType() == MVT::f16 &&
6804 "Do not know how to custom lower FP_ROUND for non-f16 type");
6805
6806 SDValue Src = Op.getOperand(0);
6807 EVT SrcVT = Src.getValueType();
6808 if (SrcVT != MVT::f64)
6809 return Op;
6810
6811 // TODO: Handle strictfp
6812 if (Op.getOpcode() != ISD::FP_ROUND)
6813 return Op;
6814
6815 SDLoc DL(Op);
6816
6817 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6818 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6819 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6820}
6821
6822SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6823 SelectionDAG &DAG) const {
6824 EVT VT = Op.getValueType();
6825 const MachineFunction &MF = DAG.getMachineFunction();
6826 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6827 bool IsIEEEMode = Info->getMode().IEEE;
6828
6829 // FIXME: Assert during selection that this is only selected for
6830 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6831 // mode functions, but this happens to be OK since it's only done in cases
6832 // where there is known no sNaN.
6833 if (IsIEEEMode)
6834 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6835
6836 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6837 VT == MVT::v16bf16)
6838 return splitBinaryVectorOp(Op, DAG);
6839 return Op;
6840}
6841
6842SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6843 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6844 EVT VT = Op.getValueType();
6845 assert(VT == MVT::f16);
6846
6847 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6848 EVT ExpVT = Exp.getValueType();
6849 if (ExpVT == MVT::i16)
6850 return Op;
6851
6852 SDLoc DL(Op);
6853
6854 // Correct the exponent type for f16 to i16.
6855 // Clamp the range of the exponent to the instruction's range.
6856
6857 // TODO: This should be a generic narrowing legalization, and can easily be
6858 // for GlobalISel.
6859
6860 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6861 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6862
6863 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6864 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6865
6866 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6867
6868 if (IsStrict) {
6869 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6870 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6871 }
6872
6873 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6874}
6875
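// Pick the extension used for each operand when a uniform 16-bit operation is
// promoted to 32 bits: sign/zero extension where the high bits affect the
// result, any-extend where they do not.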
6876static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
6877 switch (Op->getOpcode()) {
6878 case ISD::SRA:
6879 case ISD::SMIN:
6880 case ISD::SMAX:
6881 return ISD::SIGN_EXTEND;
6882 case ISD::SRL:
6883 case ISD::UMIN:
6884 case ISD::UMAX:
6885 return ISD::ZERO_EXTEND;
6886 case ISD::ADD:
6887 case ISD::SUB:
6888 case ISD::AND:
6889 case ISD::OR:
6890 case ISD::XOR:
6891 case ISD::SHL:
6892 case ISD::SELECT:
6893 case ISD::MUL:
6894 // operation result won't be influenced by garbage high bits.
6895 // TODO: are all of those cases correct, and are there more?
6896 return ISD::ANY_EXTEND;
6897 case ISD::SETCC: {
6898 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6899 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6900 }
6901 default:
6902 llvm_unreachable("unexpected opcode!");
6903 }
6904}
6905
6906SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6907 DAGCombinerInfo &DCI) const {
6908 const unsigned Opc = Op.getOpcode();
6909 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6910 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6911 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6912 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6913 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6914
6915 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6916 : Op->getOperand(0).getValueType();
6917 auto ExtTy = OpTy.changeElementType(MVT::i32);
6918
6919 if (DCI.isBeforeLegalizeOps() ||
6920 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6921 return SDValue();
6922
6923 auto &DAG = DCI.DAG;
6924
6925 SDLoc DL(Op);
6926 SDValue LHS;
6927 SDValue RHS;
6928 if (Opc == ISD::SELECT) {
6929 LHS = Op->getOperand(1);
6930 RHS = Op->getOperand(2);
6931 } else {
6932 LHS = Op->getOperand(0);
6933 RHS = Op->getOperand(1);
6934 }
6935
6936 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6937 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6938
6939 // Special case: for shifts, the RHS always needs a zext.
6940 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6941 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6942 else
6943 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6944
6945 // setcc always returns i1 / an i1 vector, so there is no need to truncate after.
6946 if (Opc == ISD::SETCC) {
6947 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6948 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6949 }
6950
6951 // For other ops, we extend the operation's return type as well so we need to
6952 // truncate back to the original type.
6953 SDValue NewVal;
6954 if (Opc == ISD::SELECT)
6955 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6956 else
6957 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6958
6959 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6960}
6961
6962// Custom lowering for vector multiplications and s_mul_u64.
6963SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6964 EVT VT = Op.getValueType();
6965
6966 // Split vector operands.
6967 if (VT.isVector())
6968 return splitBinaryVectorOp(Op, DAG);
6969
6970 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6971
6972 // There are four ways to lower s_mul_u64:
6973 //
6974 // 1. If all the operands are uniform, then we lower it as it is.
6975 //
6976 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6977 // multiplications because there is not a vector equivalent of s_mul_u64.
6978 //
6979 // 3. If the cost model decides that it is more efficient to use vector
6980 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6981 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6982 //
6983 // 4. If the cost model decides to use vector registers and both of the
6984 // operands are zero-extended/sign-extended from 32-bits, then we split the
6985 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6986 // possible to check if the operands are zero-extended or sign-extended in
6987 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6988 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6989 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6990 // If the cost model decides that we have to use vector registers, then
6991 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6992 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6993 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6994 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6995 // SIInstrInfo.cpp .
6996
6997 if (Op->isDivergent())
6998 return SDValue();
6999
7000 SDValue Op0 = Op.getOperand(0);
7001 SDValue Op1 = Op.getOperand(1);
7002 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7003 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7004 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7005 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7006 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7007 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7008 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7009 SDLoc SL(Op);
7010 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7011 return SDValue(
7012 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7013 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7014 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7015 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7016 return SDValue(
7017 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7018 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7019 return Op;
7020}
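// Illustrative sketch (editorial, not from the original source): a uniform
// i64 multiply whose operands are both zero-extended from 32 bits, e.g.
//
//   %a64 = zext i32 %a to i64
//   %b64 = zext i32 %b to i64
//   %m   = mul i64 %a64, %b64
//
// has at least 32 known leading zeros on both operands, so the code above is
// expected to select S_MUL_U64_U32_PSEUDO for it; the sign-extended variant
// would select S_MUL_I64_I32_PSEUDO instead.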
7021
7022SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7023 EVT VT = Op.getValueType();
7024 SDLoc SL(Op);
7025 SDValue LHS = Op.getOperand(0);
7026 SDValue RHS = Op.getOperand(1);
7027 bool isSigned = Op.getOpcode() == ISD::SMULO;
7028
7029 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7030 const APInt &C = RHSC->getAPIntValue();
7031 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7032 if (C.isPowerOf2()) {
7033      // smulo(x, signed_min) is the same as umulo(x, signed_min).
7034 bool UseArithShift = isSigned && !C.isMinSignedValue();
7035 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7036 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7037 SDValue Overflow =
7038 DAG.getSetCC(SL, MVT::i1,
7039 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7040 Result, ShiftAmt),
7041 LHS, ISD::SETNE);
7042 return DAG.getMergeValues({Result, Overflow}, SL);
7043 }
7044 }
7045
7046 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7047 SDValue Top =
7048 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7049
7050 SDValue Sign = isSigned
7051 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7052 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7053 SL, MVT::i32))
7054 : DAG.getConstant(0, SL, VT);
7055 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7056
7057 return DAG.getMergeValues({Result, Overflow}, SL);
7058}
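// Worked example (editorial sketch, not part of the original source): for
// llvm.umul.with.overflow.i32(%x, 8) the constant is a power of two, so the
// lowering above produces roughly
//
//   %res = shl i32 %x, 3
//   %ovf = icmp ne i32 (lshr i32 %res, 3), %x
//
// i.e. the multiply becomes a shift and the overflow check compares the
// shifted-back value with the original operand (an arithmetic shift is used
// in the signed case, except for the signed-min multiplier).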
7059
7060SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7061 if (Op->isDivergent()) {
7062 // Select to V_MAD_[IU]64_[IU]32.
7063 return Op;
7064 }
7065 if (Subtarget->hasSMulHi()) {
7066 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7067 return SDValue();
7068 }
7069 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7070 // calculate the high part, so we might as well do the whole thing with
7071 // V_MAD_[IU]64_[IU]32.
7072 return Op;
7073}
7074
7075SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7076  if (!Subtarget->isTrapHandlerEnabled() ||
7077      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7078    return lowerTrapEndpgm(Op, DAG);
7079
7080 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7081 : lowerTrapHsaQueuePtr(Op, DAG);
7082}
7083
7084SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7085 SDLoc SL(Op);
7086 SDValue Chain = Op.getOperand(0);
7087 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7088}
7089
7090SDValue
7091SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7092 const SDLoc &DL, Align Alignment,
7093 ImplicitParameter Param) const {
7096 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7098 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7101}
7102
7103SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7104 SelectionDAG &DAG) const {
7105 SDLoc SL(Op);
7106 SDValue Chain = Op.getOperand(0);
7107
7108 SDValue QueuePtr;
7109 // For code object version 5, QueuePtr is passed through implicit kernarg.
7110 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7112 QueuePtr =
7113 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7114 } else {
7117 Register UserSGPR = Info->getQueuePtrUserSGPR();
7118
7119 if (UserSGPR == AMDGPU::NoRegister) {
7120 // We probably are in a function incorrectly marked with
7121 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7122 // trap, so just use a null pointer.
7123 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7124 } else {
7125 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7126 MVT::i64);
7127 }
7128 }
7129
7130 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7131 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7132
7133  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7134  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7135 ToReg.getValue(1)};
7136 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7137}
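// For reference, a hedged sketch of the sequence this lowering is intended to
// produce (the exact trap id is whatever TrapID above encodes; the register
// pair is fixed by the trap ABI used here):
//
//   s_mov_b64 s[0:1], <queue pointer>   ; SGPR0_SGPR1 carries the queue ptr
//   s_trap    <TrapID>
//
// so the trap handler can locate the queue descriptor through s[0:1].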
7138
7139SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7140 SDLoc SL(Op);
7141 SDValue Chain = Op.getOperand(0);
7142
7143 // We need to simulate the 's_trap 2' instruction on targets that run in
7144 // PRIV=1 (where it is treated as a nop).
7145 if (Subtarget->hasPrivEnabledTrap2NopBug())
7146 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7147
7148  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7149  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7150 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7151}
7152
7153SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7154 SDLoc SL(Op);
7155  SDValue Chain = Op.getOperand(0);
7156  MachineFunction &MF = DAG.getMachineFunction();
7157
7158  if (!Subtarget->isTrapHandlerEnabled() ||
7159      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7160    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
7161                                     "debugtrap handler not supported",
7162                                     Op.getDebugLoc(), DS_Warning);
7163 LLVMContext &Ctx = MF.getFunction().getContext();
7164 Ctx.diagnose(NoTrap);
7165 return Chain;
7166 }
7167
7168  uint64_t TrapID =
7169      static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7170  SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7171 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7172}
7173
7174SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7175 SelectionDAG &DAG) const {
7176 if (Subtarget->hasApertureRegs()) {
7177 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7178 ? AMDGPU::SRC_SHARED_BASE
7179 : AMDGPU::SRC_PRIVATE_BASE;
7180 // Note: this feature (register) is broken. When used as a 32-bit operand,
7181 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7182 // bits.
7183 //
7184    // To work around the issue, directly emit a 64-bit mov from this register
7185    // and then extract the high bits. Note that this shouldn't even result in
7186    // a shift being emitted and should simply become a pair of registers, e.g.:
7187 // s_mov_b64 s[6:7], src_shared_base
7188 // v_mov_b32_e32 v1, s7
7189 //
7190 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7191 // coalescing would kick in and it would think it's okay to use the "HI"
7192 // subregister directly (instead of extracting the HI 32 bits) which is an
7193 // artificial (unusable) register.
7194 // Register TableGen definitions would need an overhaul to get rid of the
7195 // artificial "HI" aperture registers and prevent this kind of issue from
7196 // happening.
7197 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7198 DAG.getRegister(ApertureRegNo, MVT::i64));
7199 return DAG.getNode(
7200 ISD::TRUNCATE, DL, MVT::i32,
7201 DAG.getNode(ISD::SRL, DL, MVT::i64,
7202 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7203 }
7204
7205 // For code object version 5, private_base and shared_base are passed through
7206 // implicit kernargs.
7207 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7211 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7212 }
7213
7216 Register UserSGPR = Info->getQueuePtrUserSGPR();
7217 if (UserSGPR == AMDGPU::NoRegister) {
7218 // We probably are in a function incorrectly marked with
7219 // amdgpu-no-queue-ptr. This is undefined.
7220 return DAG.getUNDEF(MVT::i32);
7221 }
7222
7223 SDValue QueuePtr =
7224 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7225
7226 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7227 // private_segment_aperture_base_hi.
7228 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7229
7230 SDValue Ptr =
7231 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7232
7233 // TODO: Use custom target PseudoSourceValue.
7234 // TODO: We should use the value from the IR intrinsic call, but it might not
7235 // be available and how do we get it?
7237 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7238 commonAlignment(Align(64), StructOffset),
7241}
7242
7243/// Return true if the value is a known valid address, such that a null check is
7244/// not necessary.
7246 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7247 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7248 isa<BasicBlockSDNode>(Val))
7249 return true;
7250
7251 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7252 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7253
7254 // TODO: Search through arithmetic, handle arguments and loads
7255 // marked nonnull.
7256 return false;
7257}
7258
7259SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7260 SelectionDAG &DAG) const {
7261 SDLoc SL(Op);
7262
7263 const AMDGPUTargetMachine &TM =
7264 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7265
7266 unsigned DestAS, SrcAS;
7267 SDValue Src;
7268 bool IsNonNull = false;
7269 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7270 SrcAS = ASC->getSrcAddressSpace();
7271 Src = ASC->getOperand(0);
7272 DestAS = ASC->getDestAddressSpace();
7273 } else {
7274 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7275 Op.getConstantOperandVal(0) ==
7276 Intrinsic::amdgcn_addrspacecast_nonnull);
7277 Src = Op->getOperand(1);
7278 SrcAS = Op->getConstantOperandVal(2);
7279 DestAS = Op->getConstantOperandVal(3);
7280 IsNonNull = true;
7281 }
7282
7283 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7284
7285 // flat -> local/private
7286 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7287 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7288 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7289 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7290
7291 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7292 return Ptr;
7293
7294 unsigned NullVal = TM.getNullPointerValue(DestAS);
7295 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7296 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7297
7298 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7299 SegmentNullPtr);
7300 }
7301 }
7302
7303 // local/private -> flat
7304 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7305 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7306 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7307
7308 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7309 SDValue CvtPtr =
7310 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7311 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7312
7313 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7314 return CvtPtr;
7315
7316 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7317 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7318
7319 SDValue NonNull =
7320 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7321
7322 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7323 FlatNullPtr);
7324 }
7325 }
7326
7327 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7328 Op.getValueType() == MVT::i64) {
7331 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7332 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7333 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7334 }
7335
7336 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7337 Src.getValueType() == MVT::i64)
7338 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7339
7340 // global <-> flat are no-ops and never emitted.
7341
7342 const MachineFunction &MF = DAG.getMachineFunction();
7343 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7344 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7345 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7346
7347 return DAG.getUNDEF(Op->getValueType(0));
7348}
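// Editorial example (assumed IR, not from the original source): a
// local-to-flat cast such as
//
//   %f = addrspacecast ptr addrspace(3) %p to ptr
//
// follows the local/private -> flat path above: the 32-bit pointer is paired
// with the segment aperture via BUILD_VECTOR, bitcast to i64 and, unless the
// source is known non-null, wrapped in a SELECT that maps the segment null
// value to the flat null pointer.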
7349
7350// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7351// the small vector and inserting them into the big vector. That is better than
7352// the default expansion of doing it via a stack slot. Even though the use of
7353// the stack slot would be optimized away afterwards, the stack slot itself
7354// remains.
7355SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7356 SelectionDAG &DAG) const {
7357 SDValue Vec = Op.getOperand(0);
7358 SDValue Ins = Op.getOperand(1);
7359 SDValue Idx = Op.getOperand(2);
7360 EVT VecVT = Vec.getValueType();
7361 EVT InsVT = Ins.getValueType();
7362 EVT EltVT = VecVT.getVectorElementType();
7363 unsigned InsNumElts = InsVT.getVectorNumElements();
7364 unsigned IdxVal = Idx->getAsZExtVal();
7365 SDLoc SL(Op);
7366
7367 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7368 // Insert 32-bit registers at a time.
7369 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7370
7371 unsigned VecNumElts = VecVT.getVectorNumElements();
7372 EVT NewVecVT =
7373 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7374 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7376 MVT::i32, InsNumElts / 2);
7377
7378 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7379 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7380
7381 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7382 SDValue Elt;
7383 if (InsNumElts == 2) {
7384 Elt = Ins;
7385 } else {
7386 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7387 DAG.getConstant(I, SL, MVT::i32));
7388 }
7389 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7390 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7391 }
7392
7393 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7394 }
7395
7396 for (unsigned I = 0; I != InsNumElts; ++I) {
7397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7398 DAG.getConstant(I, SL, MVT::i32));
7399 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7400 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7401 }
7402 return Vec;
7403}
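// Editorial example (not from the original source): inserting a v2i16
// subvector into a v8i16 vector at index 4 takes the 16-bit path above:
//
//   (v8i16 insert_subvector %vec, (v2i16 %ins), 4)
//     -> bitcast (v4i32 insert_vector_elt (bitcast %vec), (bitcast %ins), 2)
//
// i.e. the whole pair is replaced with a single 32-bit element insert,
// avoiding the default stack-slot expansion.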
7404
7405SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7406 SelectionDAG &DAG) const {
7407 SDValue Vec = Op.getOperand(0);
7408 SDValue InsVal = Op.getOperand(1);
7409 SDValue Idx = Op.getOperand(2);
7410 EVT VecVT = Vec.getValueType();
7411 EVT EltVT = VecVT.getVectorElementType();
7412 unsigned VecSize = VecVT.getSizeInBits();
7413 unsigned EltSize = EltVT.getSizeInBits();
7414 SDLoc SL(Op);
7415
7416 // Specially handle the case of v4i16 with static indexing.
7417 unsigned NumElts = VecVT.getVectorNumElements();
7418 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7419 if (NumElts == 4 && EltSize == 16 && KIdx) {
7420 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7421
7422 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7423 DAG.getConstant(0, SL, MVT::i32));
7424 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7425 DAG.getConstant(1, SL, MVT::i32));
7426
7427 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7428 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7429
7430 unsigned Idx = KIdx->getZExtValue();
7431 bool InsertLo = Idx < 2;
7432 SDValue InsHalf = DAG.getNode(
7433 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7434 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7435 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7436
7437 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7438
7439 SDValue Concat =
7440 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7441 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7442
7443 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7444 }
7445
7446 // Static indexing does not lower to stack access, and hence there is no need
7447 // for special custom lowering to avoid stack access.
7448 if (isa<ConstantSDNode>(Idx))
7449 return SDValue();
7450
7451 // Avoid stack access for dynamic indexing by custom lowering to
7452 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7453
7454 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7455
7456 MVT IntVT = MVT::getIntegerVT(VecSize);
7457
7458 // Convert vector index to bit-index and get the required bit mask.
7459 assert(isPowerOf2_32(EltSize));
7460 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7461 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7462 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7463 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7464 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7465
7466 // 1. Create a congruent vector with the target value in each element.
7467 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7468 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7469
7470 // 2. Mask off all other indices except the required index within (1).
7471 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7472
7473 // 3. Mask off the required index within the target vector.
7474 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7475 SDValue RHS =
7476 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7477
7478 // 4. Get (2) and (3) ORed into the target vector.
7479 SDValue BFI =
7480 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7481
7482 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7483}
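// Worked example (editorial, assuming a v2i16 vector with a dynamic index):
// with EltSize = 16 the code above computes
//
//   ScaledIdx = idx << 4
//   BFM       = 0xffff << ScaledIdx
//   result    = (BFM & splat(val)) | (~BFM & vec)
//
// which matches the v_bfi_b32 / v_bfm_b32 pattern mentioned in the comment
// before the dynamic-index path.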
7484
7485SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7486 SelectionDAG &DAG) const {
7487 SDLoc SL(Op);
7488
7489 EVT ResultVT = Op.getValueType();
7490 SDValue Vec = Op.getOperand(0);
7491 SDValue Idx = Op.getOperand(1);
7492 EVT VecVT = Vec.getValueType();
7493 unsigned VecSize = VecVT.getSizeInBits();
7494 EVT EltVT = VecVT.getVectorElementType();
7495
7496 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7497
7498 // Make sure we do any optimizations that will make it easier to fold
7499 // source modifiers before obscuring it with bit operations.
7500
7501 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7502 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7503 return Combined;
7504
7505 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7506 SDValue Lo, Hi;
7507 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7508
7509 if (VecSize == 128) {
7510 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7511 Lo = DAG.getBitcast(LoVT,
7512 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7513 DAG.getConstant(0, SL, MVT::i32)));
7514 Hi = DAG.getBitcast(HiVT,
7515 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7516 DAG.getConstant(1, SL, MVT::i32)));
7517 } else if (VecSize == 256) {
7518 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7519 SDValue Parts[4];
7520 for (unsigned P = 0; P < 4; ++P) {
7521 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7522 DAG.getConstant(P, SL, MVT::i32));
7523 }
7524
7525 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7526 Parts[0], Parts[1]));
7527 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7528 Parts[2], Parts[3]));
7529 } else {
7530 assert(VecSize == 512);
7531
7532 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7533 SDValue Parts[8];
7534 for (unsigned P = 0; P < 8; ++P) {
7535 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7536 DAG.getConstant(P, SL, MVT::i32));
7537 }
7538
7539 Lo = DAG.getBitcast(LoVT,
7540 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7541 Parts[0], Parts[1], Parts[2], Parts[3]));
7542 Hi = DAG.getBitcast(HiVT,
7543 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7544 Parts[4], Parts[5], Parts[6], Parts[7]));
7545 }
7546
7547 EVT IdxVT = Idx.getValueType();
7548 unsigned NElem = VecVT.getVectorNumElements();
7549 assert(isPowerOf2_32(NElem));
7550 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7551 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7552 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7553 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7554 }
7555
7556 assert(VecSize <= 64);
7557
7558 MVT IntVT = MVT::getIntegerVT(VecSize);
7559
7560 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7561 SDValue VecBC = peekThroughBitcasts(Vec);
7562 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7563 SDValue Src = VecBC.getOperand(0);
7564 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7565 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7566 }
7567
7568 unsigned EltSize = EltVT.getSizeInBits();
7569 assert(isPowerOf2_32(EltSize));
7570
7571 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7572
7573 // Convert vector index to bit-index (* EltSize)
7574 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7575
7576 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7577 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7578
7579 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7580 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7581 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7582 }
7583
7584 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7585}
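// Worked example (editorial, assuming a v4i16 vector with a dynamic index):
// the <= 64-bit path above bitcasts the vector to i64 and shifts the wanted
// element down to bit 0:
//
//   ScaledIdx = idx << 4          ; Log2(EltSize) == 4 for 16-bit elements
//   Elt       = (bitcast vec to i64) >> ScaledIdx
//
// followed by a truncate (plus a bitcast for f16/bf16 results) to the
// requested result type.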
7586
7587static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7588 assert(Elt % 2 == 0);
7589 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7590}
7591
7592SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7593 SelectionDAG &DAG) const {
7594 SDLoc SL(Op);
7595 EVT ResultVT = Op.getValueType();
7596 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7597 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7598 MVT PackVT = MVT::getVectorVT(EltVT, 2);
7599 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7600
7601 // vector_shuffle <0,1,6,7> lhs, rhs
7602 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7603 //
7604 // vector_shuffle <6,7,2,3> lhs, rhs
7605 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7606 //
7607 // vector_shuffle <6,7,0,1> lhs, rhs
7608 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7609
7610 // Avoid scalarizing when both halves are reading from consecutive elements.
7612 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7613 if (elementPairIsContiguous(SVN->getMask(), I)) {
7614 const int Idx = SVN->getMaskElt(I);
7615 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7616 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7617 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7618 SVN->getOperand(VecIdx),
7619 DAG.getConstant(EltIdx, SL, MVT::i32));
7620 Pieces.push_back(SubVec);
7621 } else {
7622 const int Idx0 = SVN->getMaskElt(I);
7623 const int Idx1 = SVN->getMaskElt(I + 1);
7624 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7625 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7626 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7627 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7628
7629 SDValue Vec0 = SVN->getOperand(VecIdx0);
7630 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7631 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7632
7633 SDValue Vec1 = SVN->getOperand(VecIdx1);
7634 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7635 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7636 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7637 }
7638 }
7639
7640 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7641}
7642
7643SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7644 SelectionDAG &DAG) const {
7645 SDValue SVal = Op.getOperand(0);
7646 EVT ResultVT = Op.getValueType();
7647 EVT SValVT = SVal.getValueType();
7648 SDValue UndefVal = DAG.getUNDEF(SValVT);
7649 SDLoc SL(Op);
7650
7652 VElts.push_back(SVal);
7653 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7654 VElts.push_back(UndefVal);
7655
7656 return DAG.getBuildVector(ResultVT, SL, VElts);
7657}
7658
7659SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7660 SelectionDAG &DAG) const {
7661 SDLoc SL(Op);
7662 EVT VT = Op.getValueType();
7663
7664 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7665 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7666
7667 SDValue Lo = Op.getOperand(0);
7668 SDValue Hi = Op.getOperand(1);
7669
7670 // Avoid adding defined bits with the zero_extend.
7671 if (Hi.isUndef()) {
7672 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7673 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7674 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7675 }
7676
7677 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7678 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7679
7680 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7681 DAG.getConstant(16, SL, MVT::i32));
7682 if (Lo.isUndef())
7683 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7684
7685 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7686 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7687
7688 SDValue Or =
7689 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7690 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7691 }
7692
7693 // Split into 2-element chunks.
7694 const unsigned NumParts = VT.getVectorNumElements() / 2;
7696 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7697
7699 for (unsigned P = 0; P < NumParts; ++P) {
7700 SDValue Vec = DAG.getBuildVector(
7701 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7702 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7703 }
7704
7705 SDValue Blend =
7706 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7707 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7708}
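// Editorial sketch (not from the original source): without VOP3P a
// two-element 16-bit build_vector such as (build_vector i16 %lo, i16 %hi) is
// assembled above as
//
//   bitcast (or i32 (zext i16 %lo), (shl (zext i16 %hi), 16))
//
// with the zero_extend/shl dropped when the corresponding half is undef;
// wider vectors are first split into such two-element chunks.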
7709
7711 const GlobalAddressSDNode *GA) const {
7712 // OSes that use ELF REL relocations (instead of RELA) can only store a
7713 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7714 // which can create arbitrary 64-bit addends. (This is only a problem for
7715 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7716 // the high 32 bits of the addend.)
7717 //
7718 // This should be kept in sync with how HasRelocationAddend is initialized in
7719 // the constructor of ELFAMDGPUAsmBackend.
7720 if (!Subtarget->isAmdHsaOS())
7721 return false;
7722
7723 // We can fold offsets for anything that doesn't require a GOT relocation.
7724 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7728}
7729
7730static SDValue
7732 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7733 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7734 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7735 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7736 // lowered to the following code sequence:
7737 //
7738 // For constant address space:
7739 // s_getpc_b64 s[0:1]
7740 // s_add_u32 s0, s0, $symbol
7741 // s_addc_u32 s1, s1, 0
7742 //
7743 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7744 // a fixup or relocation is emitted to replace $symbol with a literal
7745 // constant, which is a pc-relative offset from the encoding of the $symbol
7746 // operand to the global variable.
7747 //
7748 // For global address space:
7749 // s_getpc_b64 s[0:1]
7750 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7751 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7752 //
7753 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7754 // fixups or relocations are emitted to replace $symbol@*@lo and
7755 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7756 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7757 // operand to the global variable.
7758 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7759 SDValue PtrHi;
7760 if (GAFlags == SIInstrInfo::MO_NONE)
7761 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7762 else
7763 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7764 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7765}
7766
7767SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7768 SDValue Op,
7769 SelectionDAG &DAG) const {
7770 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7771 SDLoc DL(GSD);
7772 EVT PtrVT = Op.getValueType();
7773
7774 const GlobalValue *GV = GSD->getGlobal();
7780 GV->hasExternalLinkage()) {
7781 Type *Ty = GV->getValueType();
7782      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7783      // zero-sized type in other languages to declare the dynamic shared
7784      // memory whose size is not known at compile time. It will be
7785      // allocated by the runtime and placed directly after the statically
7786      // allocated ones. They all share the same offset.
7787 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7788 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7789 // Adjust alignment for that dynamic shared memory array.
7791 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7792 MFI->setUsesDynamicLDS(true);
7793 return SDValue(
7794 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7795 }
7796 }
7798 }
7799
7801 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7803 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7804 }
7805
7806 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7807 SDValue AddrLo = DAG.getTargetGlobalAddress(
7808 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7809 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7810
7811 SDValue AddrHi = DAG.getTargetGlobalAddress(
7812 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7813 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7814
7815 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7816 }
7817
7818 if (shouldEmitFixup(GV))
7819 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7820
7821 if (shouldEmitPCReloc(GV))
7822 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7824
7825 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7827 PointerType *PtrTy =
7829 const DataLayout &DataLayout = DAG.getDataLayout();
7830 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7831 MachinePointerInfo PtrInfo =
7833
7834 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7837}
7838
7840 const SDLoc &DL, SDValue V) const {
7841 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7842 // the destination register.
7843 //
7844 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7845 // so we will end up with redundant moves to m0.
7846 //
7847 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7848
7849 // A Null SDValue creates a glue result.
7850 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7851 V, Chain);
7852 return SDValue(M0, 0);
7853}
7854
7855SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7856 MVT VT,
7857 unsigned Offset) const {
7858 SDLoc SL(Op);
7859 SDValue Param = lowerKernargMemParameter(
7860 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7861 // The local size values will have the hi 16-bits as zero.
7862 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7863 DAG.getValueType(VT));
7864}
7865
7867 EVT VT) {
7869 "non-hsa intrinsic with hsa target",
7870 DL.getDebugLoc());
7871 DAG.getContext()->diagnose(BadIntrin);
7872 return DAG.getUNDEF(VT);
7873}
7874
7876 EVT VT) {
7878 "intrinsic not supported on subtarget",
7879 DL.getDebugLoc());
7880 DAG.getContext()->diagnose(BadIntrin);
7881 return DAG.getUNDEF(VT);
7882}
7883
7885 ArrayRef<SDValue> Elts) {
7886 assert(!Elts.empty());
7887 MVT Type;
7888 unsigned NumElts = Elts.size();
7889
7890 if (NumElts <= 12) {
7891 Type = MVT::getVectorVT(MVT::f32, NumElts);
7892 } else {
7893 assert(Elts.size() <= 16);
7894 Type = MVT::v16f32;
7895 NumElts = 16;
7896 }
7897
7898 SmallVector<SDValue, 16> VecElts(NumElts);
7899 for (unsigned i = 0; i < Elts.size(); ++i) {
7900 SDValue Elt = Elts[i];
7901 if (Elt.getValueType() != MVT::f32)
7902 Elt = DAG.getBitcast(MVT::f32, Elt);
7903 VecElts[i] = Elt;
7904 }
7905 for (unsigned i = Elts.size(); i < NumElts; ++i)
7906 VecElts[i] = DAG.getUNDEF(MVT::f32);
7907
7908 if (NumElts == 1)
7909 return VecElts[0];
7910 return DAG.getBuildVector(Type, DL, VecElts);
7911}
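// Editorial example (not from the original source): five dword-sized address
// operands are returned as a v5f32 build_vector (non-f32 inputs are bitcast
// to f32 first); only element counts above 12 are padded with undef up to a
// v16f32.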
7912
7913static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7914 SDValue Src, int ExtraElts) {
7915 EVT SrcVT = Src.getValueType();
7916
7918
7919 if (SrcVT.isVector())
7920 DAG.ExtractVectorElements(Src, Elts);
7921 else
7922 Elts.push_back(Src);
7923
7924 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7925 while (ExtraElts--)
7926 Elts.push_back(Undef);
7927
7928 return DAG.getBuildVector(CastVT, DL, Elts);
7929}
7930
7931 // Reconstruct the required return value for an image load intrinsic.
7932 // This is more complicated due to the optional use of TexFailCtrl, which
7933 // means the required return type is an aggregate.
7935 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7936 bool Unpacked, bool IsD16, int DMaskPop,
7937 int NumVDataDwords, bool IsAtomicPacked16Bit,
7938 const SDLoc &DL) {
7939 // Determine the required return type. This is the same regardless of
7940 // IsTexFail flag
7941 EVT ReqRetVT = ResultTypes[0];
7942 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7943 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7944 ? (ReqRetNumElts + 1) / 2
7945 : ReqRetNumElts;
7946
7947 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7948
7949 MVT DataDwordVT =
7950 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7951
7952 MVT MaskPopVT =
7953 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7954
7955 SDValue Data(Result, 0);
7956 SDValue TexFail;
7957
7958 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7959 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7960 if (MaskPopVT.isVector()) {
7961 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7962 SDValue(Result, 0), ZeroIdx);
7963 } else {
7964 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7965 SDValue(Result, 0), ZeroIdx);
7966 }
7967 }
7968
7969 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7970 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7971 NumDataDwords - MaskPopDwords);
7972
7973 if (IsD16)
7974 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7975
7976 EVT LegalReqRetVT = ReqRetVT;
7977 if (!ReqRetVT.isVector()) {
7978 if (!Data.getValueType().isInteger())
7979 Data = DAG.getNode(ISD::BITCAST, DL,
7980 Data.getValueType().changeTypeToInteger(), Data);
7981 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7982 } else {
7983 // We need to widen the return vector to a legal type
7984 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7985 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7986 LegalReqRetVT =
7988 ReqRetVT.getVectorNumElements() + 1);
7989 }
7990 }
7991 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7992
7993 if (IsTexFail) {
7994 TexFail =
7995 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7996 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7997
7998 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7999 }
8000
8001 if (Result->getNumValues() == 1)
8002 return Data;
8003
8004 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8005}
8006
8007static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8008 SDValue *LWE, bool &IsTexFail) {
8009 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8010
8011 uint64_t Value = TexFailCtrlConst->getZExtValue();
8012 if (Value) {
8013 IsTexFail = true;
8014 }
8015
8016 SDLoc DL(TexFailCtrlConst);
8017 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8018 Value &= ~(uint64_t)0x1;
8019 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8020 Value &= ~(uint64_t)0x2;
8021
8022 return Value == 0;
8023}
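// Editorial note with an example (not from the original source): a
// texfailctrl value of 3 sets both TFE (bit 0) and LWE (bit 1) and marks
// IsTexFail; any set bit above bit 1 makes this helper return false, in which
// case the caller gives up on custom-lowering the intrinsic.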
8024
8026 MVT PackVectorVT,
8027 SmallVectorImpl<SDValue> &PackedAddrs,
8028 unsigned DimIdx, unsigned EndIdx,
8029 unsigned NumGradients) {
8030 SDLoc DL(Op);
8031 for (unsigned I = DimIdx; I < EndIdx; I++) {
8032 SDValue Addr = Op.getOperand(I);
8033
8034 // Gradients are packed with undef for each coordinate.
8035 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8036 // 1D: undef,dx/dh; undef,dx/dv
8037 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8038 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8039 if (((I + 1) >= EndIdx) ||
8040 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8041 I == DimIdx + NumGradients - 1))) {
8042 if (Addr.getValueType() != MVT::i16)
8043 Addr = DAG.getBitcast(MVT::i16, Addr);
8044 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8045 } else {
8046 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8047 I++;
8048 }
8049 Addr = DAG.getBitcast(MVT::f32, Addr);
8050 PackedAddrs.push_back(Addr);
8051 }
8052}
8053
8054SDValue SITargetLowering::lowerImage(SDValue Op,
8056 SelectionDAG &DAG, bool WithChain) const {
8057 SDLoc DL(Op);
8059 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8060 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8062 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8063 unsigned IntrOpcode = Intr->BaseOpcode;
8064 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8065 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8066 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8067
8068 SmallVector<EVT, 3> ResultTypes(Op->values());
8069 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8070 bool IsD16 = false;
8071 bool IsG16 = false;
8072 bool IsA16 = false;
8073 SDValue VData;
8074 int NumVDataDwords = 0;
8075 bool AdjustRetType = false;
8076 bool IsAtomicPacked16Bit = false;
8077
8078 // Offset of intrinsic arguments
8079 const unsigned ArgOffset = WithChain ? 2 : 1;
8080
8081 unsigned DMask;
8082 unsigned DMaskLanes = 0;
8083
8084 if (BaseOpcode->Atomic) {
8085 VData = Op.getOperand(2);
8086
8087 IsAtomicPacked16Bit =
8088 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8089 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8090
8091 bool Is64Bit = VData.getValueSizeInBits() == 64;
8092 if (BaseOpcode->AtomicX2) {
8093 SDValue VData2 = Op.getOperand(3);
8094 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8095 {VData, VData2});
8096 if (Is64Bit)
8097 VData = DAG.getBitcast(MVT::v4i32, VData);
8098
8099 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8100 DMask = Is64Bit ? 0xf : 0x3;
8101 NumVDataDwords = Is64Bit ? 4 : 2;
8102 } else {
8103 DMask = Is64Bit ? 0x3 : 0x1;
8104 NumVDataDwords = Is64Bit ? 2 : 1;
8105 }
8106 } else {
8107 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8108 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8109
8110 if (BaseOpcode->Store) {
8111 VData = Op.getOperand(2);
8112
8113 MVT StoreVT = VData.getSimpleValueType();
8114 if (StoreVT.getScalarType() == MVT::f16) {
8115 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8116 return Op; // D16 is unsupported for this instruction
8117
8118 IsD16 = true;
8119 VData = handleD16VData(VData, DAG, true);
8120 }
8121
8122 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8123 } else if (!BaseOpcode->NoReturn) {
8124      // Work out the number of dwords based on the dmask popcount, the
8125      // underlying type, and whether packing is supported.
8126 MVT LoadVT = ResultTypes[0].getSimpleVT();
8127 if (LoadVT.getScalarType() == MVT::f16) {
8128 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8129 return Op; // D16 is unsupported for this instruction
8130
8131 IsD16 = true;
8132 }
8133
8134 // Confirm that the return type is large enough for the dmask specified
8135 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8136 (!LoadVT.isVector() && DMaskLanes > 1))
8137 return Op;
8138
8139 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8140 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8141 // instructions.
8142 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8143 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8144 NumVDataDwords = (DMaskLanes + 1) / 2;
8145 else
8146 NumVDataDwords = DMaskLanes;
8147
8148 AdjustRetType = true;
8149 }
8150 }
8151
8152 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8154
8155 // Check for 16 bit addresses or derivatives and pack if true.
8156 MVT VAddrVT =
8157 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8158 MVT VAddrScalarVT = VAddrVT.getScalarType();
8159 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8160 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8161
8162 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8163 VAddrScalarVT = VAddrVT.getScalarType();
8164 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8165 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8166
8167 // Push back extra arguments.
8168 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8169 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8170 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8171      // Special handling of bias when A16 is on. Bias is of type half but
8172      // occupies a full 32 bits.
8173 SDValue Bias = DAG.getBuildVector(
8174 MVT::v2f16, DL,
8175 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8176 VAddrs.push_back(Bias);
8177 } else {
8178 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8179 "Bias needs to be converted to 16 bit in A16 mode");
8180 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8181 }
8182 }
8183
8184 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8185 // 16 bit gradients are supported, but are tied to the A16 control
8186 // so both gradients and addresses must be 16 bit
8187 LLVM_DEBUG(
8188 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8189 "require 16 bit args for both gradients and addresses");
8190 return Op;
8191 }
8192
8193 if (IsA16) {
8194 if (!ST->hasA16()) {
8195 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8196 "support 16 bit addresses\n");
8197 return Op;
8198 }
8199 }
8200
8201  // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8202  // set then we have to compress/pack operands (either addresses, gradients,
8203  // or both).
8204  // In the case where A16 and gradients are tied (no G16 support) we have
8205  // already verified that both IsA16 and IsG16 are true.
8206 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8207 // Activate g16
8208 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8210 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8211 }
8212
8213 // Add gradients (packed or unpacked)
8214 if (IsG16) {
8215 // Pack the gradients
8216 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8217 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8218 ArgOffset + Intr->GradientStart,
8219 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8220 } else {
8221 for (unsigned I = ArgOffset + Intr->GradientStart;
8222 I < ArgOffset + Intr->CoordStart; I++)
8223 VAddrs.push_back(Op.getOperand(I));
8224 }
8225
8226 // Add addresses (packed or unpacked)
8227 if (IsA16) {
8228 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8229 ArgOffset + Intr->CoordStart, VAddrEnd,
8230 0 /* No gradients */);
8231 } else {
8232 // Add uncompressed address
8233 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8234 VAddrs.push_back(Op.getOperand(I));
8235 }
8236
8237 // If the register allocator cannot place the address registers contiguously
8238 // without introducing moves, then using the non-sequential address encoding
8239 // is always preferable, since it saves VALU instructions and is usually a
8240 // wash in terms of code size or even better.
8241 //
8242 // However, we currently have no way of hinting to the register allocator that
8243 // MIMG addresses should be placed contiguously when it is possible to do so,
8244 // so force non-NSA for the common 2-address case as a heuristic.
8245 //
8246 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8247 // allocation when possible.
8248 //
8249 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8250 // set of the remaining addresses.
8251 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8252 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8253 const bool UseNSA = ST->hasNSAEncoding() &&
8254 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8255 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8256 const bool UsePartialNSA =
8257 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8258
8259 SDValue VAddr;
8260 if (UsePartialNSA) {
8261 VAddr = getBuildDwordsVector(DAG, DL,
8262 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8263 } else if (!UseNSA) {
8264 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8265 }
8266
8267 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8268 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8269 SDValue Unorm;
8270 if (!BaseOpcode->Sampler) {
8271 Unorm = True;
8272 } else {
8273 uint64_t UnormConst =
8274 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8275
8276 Unorm = UnormConst ? True : False;
8277 }
8278
8279 SDValue TFE;
8280 SDValue LWE;
8281 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8282 bool IsTexFail = false;
8283 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8284 return Op;
8285
8286 if (IsTexFail) {
8287 if (!DMaskLanes) {
8288 // Expecting to get an error flag since TFC is on - and dmask is 0
8289 // Force dmask to be at least 1 otherwise the instruction will fail
8290 DMask = 0x1;
8291 DMaskLanes = 1;
8292 NumVDataDwords = 1;
8293 }
8294 NumVDataDwords += 1;
8295 AdjustRetType = true;
8296 }
8297
8298  // Something earlier tagged that the return type needs adjusting.
8299  // This happens if the instruction is a load or has set TexFailCtrl flags.
8300 if (AdjustRetType) {
8301 // NumVDataDwords reflects the true number of dwords required in the return
8302 // type
8303 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8304 // This is a no-op load. This can be eliminated
8305 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8306 if (isa<MemSDNode>(Op))
8307 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8308 return Undef;
8309 }
8310
8311 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8312 MVT::i32, NumVDataDwords)
8313 : MVT::i32;
8314
8315 ResultTypes[0] = NewVT;
8316 if (ResultTypes.size() == 3) {
8317 // Original result was aggregate type used for TexFailCtrl results
8318 // The actual instruction returns as a vector type which has now been
8319 // created. Remove the aggregate result.
8320 ResultTypes.erase(&ResultTypes[1]);
8321 }
8322 }
8323
8324 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8325 if (BaseOpcode->Atomic)
8326 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8327 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8329 return Op;
8330
8332 if (BaseOpcode->Store || BaseOpcode->Atomic)
8333 Ops.push_back(VData); // vdata
8334 if (UsePartialNSA) {
8335 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8336 Ops.push_back(VAddr);
8337 } else if (UseNSA)
8338 append_range(Ops, VAddrs);
8339 else
8340 Ops.push_back(VAddr);
8341 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8342 EVT RsrcVT = Rsrc.getValueType();
8343 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8344 return Op;
8345 Ops.push_back(Rsrc);
8346 if (BaseOpcode->Sampler) {
8347 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8348 if (Samp.getValueType() != MVT::v4i32)
8349 return Op;
8350 Ops.push_back(Samp);
8351 }
8352 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8353 if (IsGFX10Plus)
8354 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8355 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8356 Ops.push_back(Unorm);
8357 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8358 Ops.push_back(IsA16 && // r128, a16 for gfx9
8359 ST->hasFeature(AMDGPU::FeatureR128A16)
8360 ? True
8361 : False);
8362 if (IsGFX10Plus)
8363 Ops.push_back(IsA16 ? True : False);
8364 if (!Subtarget->hasGFX90AInsts()) {
8365 Ops.push_back(TFE); // tfe
8366 } else if (TFE->getAsZExtVal()) {
8367 report_fatal_error("TFE is not supported on this GPU");
8368 }
8369 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8370 Ops.push_back(LWE); // lwe
8371 if (!IsGFX10Plus)
8372 Ops.push_back(DimInfo->DA ? True : False);
8373 if (BaseOpcode->HasD16)
8374 Ops.push_back(IsD16 ? True : False);
8375 if (isa<MemSDNode>(Op))
8376 Ops.push_back(Op.getOperand(0)); // chain
8377
8378 int NumVAddrDwords =
8379 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8380 int Opcode = -1;
8381
8382 if (IsGFX12Plus) {
8383 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8384 NumVDataDwords, NumVAddrDwords);
8385 } else if (IsGFX11Plus) {
8386 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8387 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8388 : AMDGPU::MIMGEncGfx11Default,
8389 NumVDataDwords, NumVAddrDwords);
8390 } else if (IsGFX10Plus) {
8391 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8392 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8393 : AMDGPU::MIMGEncGfx10Default,
8394 NumVDataDwords, NumVAddrDwords);
8395 } else {
8396 if (Subtarget->hasGFX90AInsts()) {
8397 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8398 NumVDataDwords, NumVAddrDwords);
8399 if (Opcode == -1)
8401 "requested image instruction is not supported on this GPU");
8402 }
8403 if (Opcode == -1 &&
8405 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8406 NumVDataDwords, NumVAddrDwords);
8407 if (Opcode == -1)
8408 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8409 NumVDataDwords, NumVAddrDwords);
8410 }
8411 if (Opcode == -1)
8412 return Op;
8413
8414 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8415 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8416 MachineMemOperand *MemRef = MemOp->getMemOperand();
8417 DAG.setNodeMemRefs(NewNode, {MemRef});
8418 }
8419
8420 if (BaseOpcode->AtomicX2) {
8422 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8423 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8424 }
8425 if (BaseOpcode->NoReturn)
8426 return SDValue(NewNode, 0);
8427 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8428 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8429 NumVDataDwords, IsAtomicPacked16Bit, DL);
8430}
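// Editorial example of the partial-NSA handling above (assumed operand
// counts, not from the original source): with NSAMaxSize == 5 and 7 address
// dwords, the first NSAMaxSize - 1 == 4 addresses are passed as individual
// NSA operands and the remaining 3 are packed by getBuildDwordsVector() into
// one contiguous trailing register tuple.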
8431
8432SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8433 SDValue Offset, SDValue CachePolicy,
8434 SelectionDAG &DAG) const {
8436
8437 const DataLayout &DataLayout = DAG.getDataLayout();
8438 Align Alignment =
8440
8445 VT.getStoreSize(), Alignment);
8446
8447 if (!Offset->isDivergent()) {
8448 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8449
8450 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8451 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8452 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8453 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8454 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8455 SDValue BufferLoad =
8457 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8458 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8459 }
8460
8461 // Widen vec3 load to vec4.
8462 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8463 !Subtarget->hasScalarDwordx3Loads()) {
8464 EVT WidenedVT =
8466 auto WidenedOp = DAG.getMemIntrinsicNode(
8467 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8468 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8469 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8470 DAG.getVectorIdxConstant(0, DL));
8471 return Subvector;
8472 }
8473
8475 DAG.getVTList(VT), Ops, VT, MMO);
8476 }
8477
8478 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8479 // assume that the buffer is unswizzled.
8480 SDValue Ops[] = {
8481 DAG.getEntryNode(), // Chain
8482 Rsrc, // rsrc
8483 DAG.getConstant(0, DL, MVT::i32), // vindex
8484 {}, // voffset
8485 {}, // soffset
8486 {}, // offset
8487 CachePolicy, // cachepolicy
8488 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8489 };
8490 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8491 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8492 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8493 }
8494
8496 unsigned NumLoads = 1;
8497 MVT LoadVT = VT.getSimpleVT();
8498 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8499 assert((LoadVT.getScalarType() == MVT::i32 ||
8500 LoadVT.getScalarType() == MVT::f32));
8501
8502 if (NumElts == 8 || NumElts == 16) {
8503 NumLoads = NumElts / 4;
8504 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8505 }
8506
8507 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8508
8509 // Use the alignment to ensure that the required offsets will fit into the
8510 // immediate offsets.
8511 setBufferOffsets(Offset, DAG, &Ops[3],
8512 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8513
8514 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8515 for (unsigned i = 0; i < NumLoads; ++i) {
8516 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8517 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8518 LoadVT, MMO, DAG));
8519 }
8520
8521 if (NumElts == 8 || NumElts == 16)
8522 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8523
8524 return Loads[0];
8525}
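// Editorial example (not from the original source): a uniform-offset
// s.buffer.load of v3f32 on a subtarget without scalar dwordx3 loads is
// widened above to a v4f32 SBUFFER_LOAD, and the original v3f32 value is then
// recovered with an EXTRACT_SUBVECTOR at index 0.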
8526
8527SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8528 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8529 if (!Subtarget->hasArchitectedSGPRs())
8530 return {};
8531 SDLoc SL(Op);
8532 MVT VT = MVT::i32;
8533 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8534 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8535 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8536}
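// Editorial sketch (not from the original source): on subtargets with
// architected SGPRs this reads the wave id as
//
//   waveIDinGroup = BFE_U32(TTMP8, /*offset=*/25, /*width=*/5)
//
// matching the TTMP8[29:25] layout noted in the comment above.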
8537
8538SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8539 unsigned Dim,
8540 const ArgDescriptor &Arg) const {
8541 SDLoc SL(Op);
8543 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8544 if (MaxID == 0)
8545 return DAG.getConstant(0, SL, MVT::i32);
8546
8547 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8548 SDLoc(DAG.getEntryNode()), Arg);
8549
8550 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8551 // masking operations anyway.
8552 //
8553 // TODO: We could assert the top bit is 0 for the source copy.
8554 if (Arg.isMasked())
8555 return Val;
8556
8557 // Preserve the known bits after expansion to a copy.
8559 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8560 DAG.getValueType(SmallVT));
8561}
8562
8563SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8564 SelectionDAG &DAG) const {
8566 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8567
8568 EVT VT = Op.getValueType();
8569 SDLoc DL(Op);
8570 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8571
8572 // TODO: Should this propagate fast-math-flags?
8573
8574 switch (IntrinsicID) {
8575 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8576 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8577 return emitNonHSAIntrinsicError(DAG, DL, VT);
8578 return getPreloadedValue(DAG, *MFI, VT,
8580 }
8581 case Intrinsic::amdgcn_dispatch_ptr:
8582 case Intrinsic::amdgcn_queue_ptr: {
8583 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8584 DiagnosticInfoUnsupported BadIntrin(
8585 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8586 DL.getDebugLoc());
8587 DAG.getContext()->diagnose(BadIntrin);
8588 return DAG.getUNDEF(VT);
8589 }
8590
8591 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8592 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
8593 : AMDGPUFunctionArgInfo::QUEUE_PTR;
8594 return getPreloadedValue(DAG, *MFI, VT, RegID);
8595 }
8596 case Intrinsic::amdgcn_implicitarg_ptr: {
8597 if (MFI->isEntryFunction())
8598 return getImplicitArgPtr(DAG, DL);
8599 return getPreloadedValue(DAG, *MFI, VT,
8600 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8601 }
8602 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8603 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8604 // This only makes sense to call in a kernel, so just lower to null.
8605 return DAG.getConstant(0, DL, VT);
8606 }
8607
8608 return getPreloadedValue(DAG, *MFI, VT,
8609 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8610 }
8611 case Intrinsic::amdgcn_dispatch_id: {
8612 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8613 }
8614 case Intrinsic::amdgcn_rcp:
8615 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8616 case Intrinsic::amdgcn_rsq:
8617 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8618 case Intrinsic::amdgcn_rsq_legacy:
8619 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8620 return emitRemovedIntrinsicError(DAG, DL, VT);
8621 return SDValue();
8622 case Intrinsic::amdgcn_rcp_legacy:
8623 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8624 return emitRemovedIntrinsicError(DAG, DL, VT);
8625 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8626 case Intrinsic::amdgcn_rsq_clamp: {
8627 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8628 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8629
8630 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8631 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8632 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8633
8634 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8635 SDValue Tmp =
8636 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8637 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8638 DAG.getConstantFP(Min, DL, VT));
8639 }
8640 case Intrinsic::r600_read_ngroups_x:
8641 if (Subtarget->isAmdHsaOS())
8642 return emitNonHSAIntrinsicError(DAG, DL, VT);
8643
8644 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8645 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8646 false);
8647 case Intrinsic::r600_read_ngroups_y:
8648 if (Subtarget->isAmdHsaOS())
8649 return emitNonHSAIntrinsicError(DAG, DL, VT);
8650
8651 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8652 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8653 false);
8654 case Intrinsic::r600_read_ngroups_z:
8655 if (Subtarget->isAmdHsaOS())
8656 return emitNonHSAIntrinsicError(DAG, DL, VT);
8657
8658 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8659 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8660 false);
8661 case Intrinsic::r600_read_global_size_x:
8662 if (Subtarget->isAmdHsaOS())
8663 return emitNonHSAIntrinsicError(DAG, DL, VT);
8664
8665 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8666 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8667 Align(4), false);
8668 case Intrinsic::r600_read_global_size_y:
8669 if (Subtarget->isAmdHsaOS())
8670 return emitNonHSAIntrinsicError(DAG, DL, VT);
8671
8672 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8673 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8674 Align(4), false);
8675 case Intrinsic::r600_read_global_size_z:
8676 if (Subtarget->isAmdHsaOS())
8677 return emitNonHSAIntrinsicError(DAG, DL, VT);
8678
8679 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8680 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8681 Align(4), false);
8682 case Intrinsic::r600_read_local_size_x:
8683 if (Subtarget->isAmdHsaOS())
8684 return emitNonHSAIntrinsicError(DAG, DL, VT);
8685
8686 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8687 SI::KernelInputOffsets::LOCAL_SIZE_X);
8688 case Intrinsic::r600_read_local_size_y:
8689 if (Subtarget->isAmdHsaOS())
8690 return emitNonHSAIntrinsicError(DAG, DL, VT);
8691
8692 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8693 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8694 case Intrinsic::r600_read_local_size_z:
8695 if (Subtarget->isAmdHsaOS())
8696 return emitNonHSAIntrinsicError(DAG, DL, VT);
8697
8698 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8699 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8700 case Intrinsic::amdgcn_workgroup_id_x:
8701 return getPreloadedValue(DAG, *MFI, VT,
8702 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8703 case Intrinsic::amdgcn_workgroup_id_y:
8704 return getPreloadedValue(DAG, *MFI, VT,
8705 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8706 case Intrinsic::amdgcn_workgroup_id_z:
8707 return getPreloadedValue(DAG, *MFI, VT,
8708 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8709 case Intrinsic::amdgcn_wave_id:
8710 return lowerWaveID(DAG, Op);
8711 case Intrinsic::amdgcn_lds_kernel_id: {
8712 if (MFI->isEntryFunction())
8713 return getLDSKernelId(DAG, DL);
8714 return getPreloadedValue(DAG, *MFI, VT,
8715 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8716 }
8717 case Intrinsic::amdgcn_workitem_id_x:
8718 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8719 case Intrinsic::amdgcn_workitem_id_y:
8720 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8721 case Intrinsic::amdgcn_workitem_id_z:
8722 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8723 case Intrinsic::amdgcn_wavefrontsize:
8724 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8725 SDLoc(Op), MVT::i32);
8726 case Intrinsic::amdgcn_s_buffer_load: {
8727 unsigned CPol = Op.getConstantOperandVal(3);
8728 // s_buffer_load, because of how it's optimized, can't be volatile
8729 // so reject ones with the volatile bit set.
8730 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8731 ? AMDGPU::CPol::ALL
8732 : AMDGPU::CPol::ALL_pregfx12))
8733 return Op;
8734 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8735 Op.getOperand(3), DAG);
8736 }
8737 case Intrinsic::amdgcn_fdiv_fast:
8738 return lowerFDIV_FAST(Op, DAG);
8739 case Intrinsic::amdgcn_sin:
8740 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8741
8742 case Intrinsic::amdgcn_cos:
8743 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8744
8745 case Intrinsic::amdgcn_mul_u24:
8746 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8747 Op.getOperand(2));
8748 case Intrinsic::amdgcn_mul_i24:
8749 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8750 Op.getOperand(2));
8751
8752 case Intrinsic::amdgcn_log_clamp: {
8753 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8754 return SDValue();
8755
8756 return emitRemovedIntrinsicError(DAG, DL, VT);
8757 }
8758 case Intrinsic::amdgcn_fract:
8759 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8760
8761 case Intrinsic::amdgcn_class:
8762 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8763 Op.getOperand(2));
8764 case Intrinsic::amdgcn_div_fmas:
8765 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8766 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8767
8768 case Intrinsic::amdgcn_div_fixup:
8769 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8770 Op.getOperand(2), Op.getOperand(3));
8771
8772 case Intrinsic::amdgcn_div_scale: {
8773 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8774
8775 // Translate to the operands expected by the machine instruction. The
8776 // first parameter must be the same as the first instruction.
8777 SDValue Numerator = Op.getOperand(1);
8778 SDValue Denominator = Op.getOperand(2);
8779
8780 // Note this order is opposite of the machine instruction's operations,
8781 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8782 // intrinsic has the numerator as the first operand to match a normal
8783 // division operation.
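// For example, llvm.amdgcn.div.scale(%num, %den, true) ends up with the
// numerator as src0, while a false flag selects the denominator.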
8784
8785 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8786
8787 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8788 Denominator, Numerator);
8789 }
8790 case Intrinsic::amdgcn_icmp: {
8791 // There is a Pat that handles this variant, so return it as-is.
8792 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8793 Op.getConstantOperandVal(2) == 0 &&
8794 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8795 return Op;
8796 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8797 }
8798 case Intrinsic::amdgcn_fcmp: {
8799 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8800 }
8801 case Intrinsic::amdgcn_ballot:
8802 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8803 case Intrinsic::amdgcn_fmed3:
8804 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8805 Op.getOperand(2), Op.getOperand(3));
8806 case Intrinsic::amdgcn_fdot2:
8807 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8808 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8809 case Intrinsic::amdgcn_fmul_legacy:
8810 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8811 Op.getOperand(2));
8812 case Intrinsic::amdgcn_sffbh:
8813 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8814 case Intrinsic::amdgcn_sbfe:
8815 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8816 Op.getOperand(2), Op.getOperand(3));
8817 case Intrinsic::amdgcn_ubfe:
8818 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8819 Op.getOperand(2), Op.getOperand(3));
8820 case Intrinsic::amdgcn_cvt_pkrtz:
8821 case Intrinsic::amdgcn_cvt_pknorm_i16:
8822 case Intrinsic::amdgcn_cvt_pknorm_u16:
8823 case Intrinsic::amdgcn_cvt_pk_i16:
8824 case Intrinsic::amdgcn_cvt_pk_u16: {
8825 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8826 EVT VT = Op.getValueType();
8827 unsigned Opcode;
8828
8829 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8830 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8831 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8832 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8833 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8834 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8835 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8836 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8837 else
8838 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8839 
8840 if (isTypeLegal(VT))
8841 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8842
8843 SDValue Node =
8844 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8845 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8846 }
8847 case Intrinsic::amdgcn_fmad_ftz:
8848 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8849 Op.getOperand(2), Op.getOperand(3));
8850
8851 case Intrinsic::amdgcn_if_break:
8852 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8853 Op->getOperand(1), Op->getOperand(2)),
8854 0);
8855
8856 case Intrinsic::amdgcn_groupstaticsize: {
8857 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8858 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8859 return Op;
8860
8861 const Module *M = MF.getFunction().getParent();
8862 const GlobalValue *GV =
8863 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8864 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8865 SIInstrInfo::MO_ABS32_LO);
8866 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8867 }
8868 case Intrinsic::amdgcn_is_shared:
8869 case Intrinsic::amdgcn_is_private: {
8870 SDLoc SL(Op);
8871 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8872 ? AMDGPUAS::LOCAL_ADDRESS
8873 : AMDGPUAS::PRIVATE_ADDRESS;
8874 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8875 SDValue SrcVec =
8876 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8877
8878 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8879 DAG.getConstant(1, SL, MVT::i32));
8880 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8881 }
8882 case Intrinsic::amdgcn_perm:
8883 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8884 Op.getOperand(2), Op.getOperand(3));
8885 case Intrinsic::amdgcn_reloc_constant: {
8886 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8887 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8888 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8889 auto *RelocSymbol = cast<GlobalVariable>(
8890 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8891 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8892 SIInstrInfo::MO_ABS32_LO);
8893 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8894 }
8895 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8896 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8897 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8898 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8899 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8900 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8901 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8902 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8903 if (Op.getOperand(4).getValueType() == MVT::i32)
8904 return SDValue();
8905
8906 SDLoc SL(Op);
8907 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8908 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8909 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8910 Op.getOperand(3), IndexKeyi32);
8911 }
8912 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8913 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8914 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8915 if (Op.getOperand(6).getValueType() == MVT::i32)
8916 return SDValue();
8917
8918 SDLoc SL(Op);
8919 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8920 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8921 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8922 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8923 IndexKeyi32, Op.getOperand(7)});
8924 }
8925 case Intrinsic::amdgcn_addrspacecast_nonnull:
8926 return lowerADDRSPACECAST(Op, DAG);
8927 case Intrinsic::amdgcn_readlane:
8928 case Intrinsic::amdgcn_readfirstlane:
8929 case Intrinsic::amdgcn_writelane:
8930 case Intrinsic::amdgcn_permlane16:
8931 case Intrinsic::amdgcn_permlanex16:
8932 case Intrinsic::amdgcn_permlane64:
8933 case Intrinsic::amdgcn_set_inactive:
8934 case Intrinsic::amdgcn_set_inactive_chain_arg:
8935 case Intrinsic::amdgcn_mov_dpp8:
8936 case Intrinsic::amdgcn_update_dpp:
8937 return lowerLaneOp(*this, Op.getNode(), DAG);
8938 default:
8939 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8940 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8941 return lowerImage(Op, ImageDimIntr, DAG, false);
8942
8943 return Op;
8944 }
8945}
8946
8947// On targets not supporting constant in soffset field, turn zero to
8948// SGPR_NULL to avoid generating an extra s_mov with zero.
8949 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8950 const GCNSubtarget *Subtarget) {
8951 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8952 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8953 return SOffset;
8954}
8955
8956SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8957 SelectionDAG &DAG,
8958 unsigned NewOpcode) const {
8959 SDLoc DL(Op);
8960
8961 SDValue VData = Op.getOperand(2);
8962 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8963 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8964 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8965 SDValue Ops[] = {
8966 Op.getOperand(0), // Chain
8967 VData, // vdata
8968 Rsrc, // rsrc
8969 DAG.getConstant(0, DL, MVT::i32), // vindex
8970 VOffset, // voffset
8971 SOffset, // soffset
8972 Offset, // offset
8973 Op.getOperand(6), // cachepolicy
8974 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8975 };
8976
8977 auto *M = cast<MemSDNode>(Op);
8978
8979 EVT MemVT = VData.getValueType();
8980 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8981 M->getMemOperand());
8982}
8983
8984SDValue
8985SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8986 unsigned NewOpcode) const {
8987 SDLoc DL(Op);
8988
8989 SDValue VData = Op.getOperand(2);
8990 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8991 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8992 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8993 SDValue Ops[] = {
8994 Op.getOperand(0), // Chain
8995 VData, // vdata
8996 Rsrc, // rsrc
8997 Op.getOperand(4), // vindex
8998 VOffset, // voffset
8999 SOffset, // soffset
9000 Offset, // offset
9001 Op.getOperand(7), // cachepolicy
9002 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9003 };
9004
9005 auto *M = cast<MemSDNode>(Op);
9006
9007 EVT MemVT = VData.getValueType();
9008 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9009 M->getMemOperand());
9010}
9011
9012SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9013 SelectionDAG &DAG) const {
9014 unsigned IntrID = Op.getConstantOperandVal(1);
9015 SDLoc DL(Op);
9016
9017 switch (IntrID) {
9018 case Intrinsic::amdgcn_ds_ordered_add:
9019 case Intrinsic::amdgcn_ds_ordered_swap: {
9020 MemSDNode *M = cast<MemSDNode>(Op);
9021 SDValue Chain = M->getOperand(0);
9022 SDValue M0 = M->getOperand(2);
9023 SDValue Value = M->getOperand(3);
9024 unsigned IndexOperand = M->getConstantOperandVal(7);
9025 unsigned WaveRelease = M->getConstantOperandVal(8);
9026 unsigned WaveDone = M->getConstantOperandVal(9);
9027
9028 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9029 IndexOperand &= ~0x3f;
9030 unsigned CountDw = 0;
9031
9032 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9033 CountDw = (IndexOperand >> 24) & 0xf;
9034 IndexOperand &= ~(0xf << 24);
9035
9036 if (CountDw < 1 || CountDw > 4) {
9037 report_fatal_error(
9038 "ds_ordered_count: dword count must be between 1 and 4");
9039 }
9040 }
9041
9042 if (IndexOperand)
9043 report_fatal_error("ds_ordered_count: bad index operand");
9044
9045 if (WaveDone && !WaveRelease)
9046 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9047
9048 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9049 unsigned ShaderType =
9050 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9051 unsigned Offset0 = OrderedCountIndex << 2;
9052 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9053
9054 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9055 Offset1 |= (CountDw - 1) << 6;
9056
9057 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9058 Offset1 |= ShaderType << 2;
9059
9060 unsigned Offset = Offset0 | (Offset1 << 8);
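// Worked example (GFX11+, so no shader type field): ds_ordered_add with
// ordered-count index 1, wave_release set, wave_done clear and a dword
// count of 1 gives Offset0 = 0x4 and Offset1 = 0x1, i.e. Offset = 0x104.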
9061
9062 SDValue Ops[] = {
9063 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9064 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9065 };
9066 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9067 M->getVTList(), Ops, M->getMemoryVT(),
9068 M->getMemOperand());
9069 }
9070 case Intrinsic::amdgcn_raw_buffer_load:
9071 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9072 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9073 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9074 case Intrinsic::amdgcn_raw_buffer_load_format:
9075 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9076 const bool IsFormat =
9077 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9078 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9079
9080 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9081 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9082 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9083 SDValue Ops[] = {
9084 Op.getOperand(0), // Chain
9085 Rsrc, // rsrc
9086 DAG.getConstant(0, DL, MVT::i32), // vindex
9087 VOffset, // voffset
9088 SOffset, // soffset
9089 Offset, // offset
9090 Op.getOperand(5), // cachepolicy, swizzled buffer
9091 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9092 };
9093
9094 auto *M = cast<MemSDNode>(Op);
9095 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9096 }
9097 case Intrinsic::amdgcn_struct_buffer_load:
9098 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9099 case Intrinsic::amdgcn_struct_buffer_load_format:
9100 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9101 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9102 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9103 const bool IsFormat =
9104 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9105 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9106
9107 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9108 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9109 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9110 SDValue Ops[] = {
9111 Op.getOperand(0), // Chain
9112 Rsrc, // rsrc
9113 Op.getOperand(3), // vindex
9114 VOffset, // voffset
9115 SOffset, // soffset
9116 Offset, // offset
9117 Op.getOperand(6), // cachepolicy, swizzled buffer
9118 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9119 };
9120
9121 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9122 }
9123 case Intrinsic::amdgcn_raw_tbuffer_load:
9124 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9125 MemSDNode *M = cast<MemSDNode>(Op);
9126 EVT LoadVT = Op.getValueType();
9127 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9128 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9129 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9130
9131 SDValue Ops[] = {
9132 Op.getOperand(0), // Chain
9133 Rsrc, // rsrc
9134 DAG.getConstant(0, DL, MVT::i32), // vindex
9135 VOffset, // voffset
9136 SOffset, // soffset
9137 Offset, // offset
9138 Op.getOperand(5), // format
9139 Op.getOperand(6), // cachepolicy, swizzled buffer
9140 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9141 };
9142
9143 if (LoadVT.getScalarType() == MVT::f16)
9144 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9145 Ops);
9146 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9147 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9148 DAG);
9149 }
9150 case Intrinsic::amdgcn_struct_tbuffer_load:
9151 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9152 MemSDNode *M = cast<MemSDNode>(Op);
9153 EVT LoadVT = Op.getValueType();
9154 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9155 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9156 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9157
9158 SDValue Ops[] = {
9159 Op.getOperand(0), // Chain
9160 Rsrc, // rsrc
9161 Op.getOperand(3), // vindex
9162 VOffset, // voffset
9163 SOffset, // soffset
9164 Offset, // offset
9165 Op.getOperand(6), // format
9166 Op.getOperand(7), // cachepolicy, swizzled buffer
9167 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9168 };
9169
9170 if (LoadVT.getScalarType() == MVT::f16)
9171 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9172 Ops);
9173 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9174 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9175 DAG);
9176 }
9177 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9179 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9180 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9182 return lowerStructBufferAtomicIntrin(Op, DAG,
9183 AMDGPUISD::BUFFER_ATOMIC_FADD);
9184 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9186 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9187 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9188 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9189 return lowerStructBufferAtomicIntrin(Op, DAG,
9190 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9191 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9192 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9193 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9194 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9195 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9196 return lowerStructBufferAtomicIntrin(Op, DAG,
9197 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9198 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9200 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9201 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9202 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9203 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9204 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9205 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9206 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9207 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9208 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9209 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9210 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9212 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9213 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9215 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9216 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9217 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9218 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9219 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9221 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9222 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9224 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9225 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9227 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9228 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9229 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9230 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9231 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9232 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9233 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9234 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9235 return lowerRawBufferAtomicIntrin(Op, DAG,
9236 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9237 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9239 return lowerStructBufferAtomicIntrin(Op, DAG,
9240 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9241 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9243 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9244 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9246 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9247 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9248 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9249 return lowerStructBufferAtomicIntrin(Op, DAG,
9250 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9251 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9252 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9253 return lowerStructBufferAtomicIntrin(Op, DAG,
9254 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9255 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9257 return lowerStructBufferAtomicIntrin(Op, DAG,
9258 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9259 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9260 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9261 return lowerStructBufferAtomicIntrin(Op, DAG,
9262 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9263 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9264 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9265 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9266 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9267 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9268 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9269 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9271 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9272 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9274 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9275 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9276 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9277 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9278 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9279 return lowerStructBufferAtomicIntrin(Op, DAG,
9280 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9281
9282 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9284 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9285 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9286 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9287 SDValue Ops[] = {
9288 Op.getOperand(0), // Chain
9289 Op.getOperand(2), // src
9290 Op.getOperand(3), // cmp
9291 Rsrc, // rsrc
9292 DAG.getConstant(0, DL, MVT::i32), // vindex
9293 VOffset, // voffset
9294 SOffset, // soffset
9295 Offset, // offset
9296 Op.getOperand(7), // cachepolicy
9297 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9298 };
9299 EVT VT = Op.getValueType();
9300 auto *M = cast<MemSDNode>(Op);
9301
9302 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9303 Op->getVTList(), Ops, VT,
9304 M->getMemOperand());
9305 }
9306 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9308 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9309 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9310 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9311 SDValue Ops[] = {
9312 Op.getOperand(0), // Chain
9313 Op.getOperand(2), // src
9314 Op.getOperand(3), // cmp
9315 Rsrc, // rsrc
9316 Op.getOperand(5), // vindex
9317 VOffset, // voffset
9318 SOffset, // soffset
9319 Offset, // offset
9320 Op.getOperand(8), // cachepolicy
9321 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9322 };
9323 EVT VT = Op.getValueType();
9324 auto *M = cast<MemSDNode>(Op);
9325
9326 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9327 Op->getVTList(), Ops, VT,
9328 M->getMemOperand());
9329 }
9330 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9331 MemSDNode *M = cast<MemSDNode>(Op);
9332 SDValue NodePtr = M->getOperand(2);
9333 SDValue RayExtent = M->getOperand(3);
9334 SDValue RayOrigin = M->getOperand(4);
9335 SDValue RayDir = M->getOperand(5);
9336 SDValue RayInvDir = M->getOperand(6);
9337 SDValue TDescr = M->getOperand(7);
9338
9339 assert(NodePtr.getValueType() == MVT::i32 ||
9340 NodePtr.getValueType() == MVT::i64);
9341 assert(RayDir.getValueType() == MVT::v3f16 ||
9342 RayDir.getValueType() == MVT::v3f32);
9343
9344 if (!Subtarget->hasGFX10_AEncoding()) {
9345 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9346 return SDValue();
9347 }
9348
9349 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9350 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9351 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9352 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9353 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9354 const unsigned NumVDataDwords = 4;
9355 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9356 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9357 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9358 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9359 IsGFX12Plus;
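// For instance, a 64-bit node pointer with 32-bit ray components needs
// 2 + 1 + 3*3 = 12 VADDR dwords; whether the NSA encoding can be used
// depends on how many separate address operands the subtarget's NSA limit
// allows, while GFX12+ always uses it.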
9360 const unsigned BaseOpcodes[2][2] = {
9361 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9362 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9363 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9364 int Opcode;
9365 if (UseNSA) {
9366 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9367 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9368 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9369 : AMDGPU::MIMGEncGfx10NSA,
9370 NumVDataDwords, NumVAddrDwords);
9371 } else {
9372 assert(!IsGFX12Plus);
9373 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9374 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9375 : AMDGPU::MIMGEncGfx10Default,
9376 NumVDataDwords, NumVAddrDwords);
9377 }
9378 assert(Opcode != -1);
9379
9380 SmallVector<SDValue, 16> Ops;
9381 
9382 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9383 SmallVector<SDValue, 3> Lanes;
9384 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9385 if (Lanes[0].getValueSizeInBits() == 32) {
9386 for (unsigned I = 0; I < 3; ++I)
9387 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9388 } else {
9389 if (IsAligned) {
9390 Ops.push_back(DAG.getBitcast(
9391 MVT::i32,
9392 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9393 Ops.push_back(Lanes[2]);
9394 } else {
9395 SDValue Elt0 = Ops.pop_back_val();
9396 Ops.push_back(DAG.getBitcast(
9397 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9398 Ops.push_back(DAG.getBitcast(
9399 MVT::i32,
9400 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9401 }
9402 }
9403 };
9404
9405 if (UseNSA && IsGFX11Plus) {
9406 Ops.push_back(NodePtr);
9407 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9408 Ops.push_back(RayOrigin);
9409 if (IsA16) {
9410 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9411 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9412 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9413 for (unsigned I = 0; I < 3; ++I) {
9414 MergedLanes.push_back(DAG.getBitcast(
9415 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9416 {DirLanes[I], InvDirLanes[I]})));
9417 }
9418 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9419 } else {
9420 Ops.push_back(RayDir);
9421 Ops.push_back(RayInvDir);
9422 }
9423 } else {
9424 if (Is64)
9425 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9426 2);
9427 else
9428 Ops.push_back(NodePtr);
9429
9430 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9431 packLanes(RayOrigin, true);
9432 packLanes(RayDir, true);
9433 packLanes(RayInvDir, false);
9434 }
9435
9436 if (!UseNSA) {
9437 // Build a single vector containing all the operands so far prepared.
9438 if (NumVAddrDwords > 12) {
9439 SDValue Undef = DAG.getUNDEF(MVT::i32);
9440 Ops.append(16 - Ops.size(), Undef);
9441 }
9442 assert(Ops.size() >= 8 && Ops.size() <= 12);
9443 SDValue MergedOps =
9444 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9445 Ops.clear();
9446 Ops.push_back(MergedOps);
9447 }
9448
9449 Ops.push_back(TDescr);
9450 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9451 Ops.push_back(M->getChain());
9452
9453 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9454 MachineMemOperand *MemRef = M->getMemOperand();
9455 DAG.setNodeMemRefs(NewNode, {MemRef});
9456 return SDValue(NewNode, 0);
9457 }
9458 case Intrinsic::amdgcn_global_atomic_fmin_num:
9459 case Intrinsic::amdgcn_global_atomic_fmax_num:
9460 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9461 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9462 MemSDNode *M = cast<MemSDNode>(Op);
9463 SDValue Ops[] = {
9464 M->getOperand(0), // Chain
9465 M->getOperand(2), // Ptr
9466 M->getOperand(3) // Value
9467 };
9468 unsigned Opcode = 0;
9469 switch (IntrID) {
9470 case Intrinsic::amdgcn_global_atomic_fmin_num:
9471 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9472 Opcode = ISD::ATOMIC_LOAD_FMIN;
9473 break;
9474 }
9475 case Intrinsic::amdgcn_global_atomic_fmax_num:
9476 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9477 Opcode = ISD::ATOMIC_LOAD_FMAX;
9478 break;
9479 }
9480 default:
9481 llvm_unreachable("unhandled atomic opcode");
9482 }
9483 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9484 Ops, M->getMemOperand());
9485 }
9486 case Intrinsic::amdgcn_s_get_barrier_state:
9487 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9488 SDValue Chain = Op->getOperand(0);
9489 SmallVector<SDValue, 2> Ops;
9490 unsigned Opc;
9491
9492 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9493 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9494 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9495 BarID = (BarID >> 4) & 0x3F;
9496 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9497 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9498 Ops.push_back(K);
9499 Ops.push_back(Chain);
9500 } else {
9501 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9502 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9503 SDValue M0Val;
9504 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9505 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9506 M0Val = SDValue(
9507 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9508 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9509 0);
9510 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9511 } else
9512 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9513 }
9514
9515 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9516 return SDValue(NewMI, 0);
9517 }
9518 default:
9519
9520 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9521 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9522 return lowerImage(Op, ImageDimIntr, DAG, true);
9523
9524 return SDValue();
9525 }
9526}
9527
9528// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9529// dwordx4 if on SI and handle TFE loads.
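// For example, a TFE dwordx2 load is widened to a v3i32 intrinsic result
// below: dwords 0-1 carry the value and dword 2 carries the status, which
// are then split back out into separate merge values.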
9530SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9531 SDVTList VTList,
9532 ArrayRef<SDValue> Ops, EVT MemVT,
9533 MachineMemOperand *MMO,
9534 SelectionDAG &DAG) const {
9535 LLVMContext &C = *DAG.getContext();
9536 MachineFunction &MF = DAG.getMachineFunction();
9537 EVT VT = VTList.VTs[0];
9538
9539 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9540 bool IsTFE = VTList.NumVTs == 3;
9541 if (IsTFE) {
9542 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9543 unsigned NumOpDWords = NumValueDWords + 1;
9544 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9545 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9546 MachineMemOperand *OpDWordsMMO =
9547 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9548 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9549 OpDWordsVT, OpDWordsMMO, DAG);
9550 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9551 DAG.getVectorIdxConstant(NumValueDWords, DL));
9552 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9553 SDValue ValueDWords =
9554 NumValueDWords == 1
9555 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9556 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9557 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9558 ZeroIdx);
9559 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9560 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9561 }
9562
9563 if (!Subtarget->hasDwordx3LoadStores() &&
9564 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9565 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9566 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9567 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9568 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9569 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9570 WidenedMemVT, WidenedMMO);
9571 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9572 DAG.getVectorIdxConstant(0, DL));
9573 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9574 }
9575
9576 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9577}
9578
9579SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9580 bool ImageStore) const {
9581 EVT StoreVT = VData.getValueType();
9582
9583 // No change for f16 and legal vector D16 types.
9584 if (!StoreVT.isVector())
9585 return VData;
9586
9587 SDLoc DL(VData);
9588 unsigned NumElements = StoreVT.getVectorNumElements();
9589
9590 if (Subtarget->hasUnpackedD16VMem()) {
9591 // We need to unpack the packed data to store.
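// For example, on such targets a v2f16 store operand becomes v2i32, with
// each 16-bit element zero-extended into its own dword.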
9592 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9593 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9594
9595 EVT EquivStoreVT =
9596 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9597 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9598 return DAG.UnrollVectorOp(ZExt.getNode());
9599 }
9600
9601 // The sq block of gfx8.1 does not estimate register use correctly for d16
9602 // image store instructions. The data operand is computed as if it were not a
9603 // d16 image instruction.
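// For example, a v4f16 data operand is repacked into two i32 dwords and
// then padded with undef back out to four dwords, matching the register
// count the hardware assumes for the non-d16 form.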
9604 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9605 // Bitcast to i16
9606 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9607 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9608
9609 // Decompose into scalars
9610 SmallVector<SDValue, 4> Elts;
9611 DAG.ExtractVectorElements(IntVData, Elts);
9612
9613 // Group pairs of i16 into v2i16 and bitcast to i32
9614 SmallVector<SDValue, 4> PackedElts;
9615 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9616 SDValue Pair =
9617 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9618 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9619 PackedElts.push_back(IntPair);
9620 }
9621 if ((NumElements % 2) == 1) {
9622 // Handle v3i16
9623 unsigned I = Elts.size() / 2;
9624 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9625 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9626 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9627 PackedElts.push_back(IntPair);
9628 }
9629
9630 // Pad using UNDEF
9631 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9632
9633 // Build final vector
9634 EVT VecVT =
9635 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9636 return DAG.getBuildVector(VecVT, DL, PackedElts);
9637 }
9638
9639 if (NumElements == 3) {
9640 EVT IntStoreVT =
9641 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9642 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9643
9644 EVT WidenedStoreVT = EVT::getVectorVT(
9645 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9646 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9647 WidenedStoreVT.getStoreSizeInBits());
9648 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9649 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9650 }
9651
9652 assert(isTypeLegal(StoreVT));
9653 return VData;
9654}
9655
9656SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9657 SelectionDAG &DAG) const {
9658 SDLoc DL(Op);
9659 SDValue Chain = Op.getOperand(0);
9660 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9661 MachineFunction &MF = DAG.getMachineFunction();
9662 
9663 switch (IntrinsicID) {
9664 case Intrinsic::amdgcn_exp_compr: {
9665 if (!Subtarget->hasCompressedExport()) {
9666 DiagnosticInfoUnsupported BadIntrin(
9667 MF.getFunction(),
9668 "intrinsic not supported on subtarget", DL.getDebugLoc());
9669 DAG.getContext()->diagnose(BadIntrin);
9670 }
9671 SDValue Src0 = Op.getOperand(4);
9672 SDValue Src1 = Op.getOperand(5);
9673 // Hack around illegal type on SI by directly selecting it.
9674 if (isTypeLegal(Src0.getValueType()))
9675 return SDValue();
9676
9677 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9678 SDValue Undef = DAG.getUNDEF(MVT::f32);
9679 const SDValue Ops[] = {
9680 Op.getOperand(2), // tgt
9681 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9682 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9683 Undef, // src2
9684 Undef, // src3
9685 Op.getOperand(7), // vm
9686 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9687 Op.getOperand(3), // en
9688 Op.getOperand(0) // Chain
9689 };
9690
9691 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9692 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9693 }
9694 case Intrinsic::amdgcn_s_barrier:
9695 case Intrinsic::amdgcn_s_barrier_signal:
9696 case Intrinsic::amdgcn_s_barrier_wait: {
9697 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9698 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9699 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9700 if (WGSize <= ST.getWavefrontSize()) {
9701 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9702 // s_barrier/s_barrier_wait to wave_barrier.
9703 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9704 return Op.getOperand(0);
9705 else
9706 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9707 MVT::Other, Op.getOperand(0)),
9708 0);
9709 }
9710 }
9711
9712 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9713 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9714 SDValue K =
9715 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9716 SDValue BarSignal =
9717 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9718 MVT::Other, K, Op.getOperand(0)),
9719 0);
9720 SDValue BarWait =
9721 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9722 BarSignal.getValue(0)),
9723 0);
9724 return BarWait;
9725 }
9726
9727 return SDValue();
9728 };
9729
9730 case Intrinsic::amdgcn_struct_tbuffer_store:
9731 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9732 SDValue VData = Op.getOperand(2);
9733 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9734 if (IsD16)
9735 VData = handleD16VData(VData, DAG);
9736 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9737 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9738 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9739 SDValue Ops[] = {
9740 Chain,
9741 VData, // vdata
9742 Rsrc, // rsrc
9743 Op.getOperand(4), // vindex
9744 VOffset, // voffset
9745 SOffset, // soffset
9746 Offset, // offset
9747 Op.getOperand(7), // format
9748 Op.getOperand(8), // cachepolicy, swizzled buffer
9749 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9750 };
9751 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9752 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9753 MemSDNode *M = cast<MemSDNode>(Op);
9754 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9755 M->getMemoryVT(), M->getMemOperand());
9756 }
9757
9758 case Intrinsic::amdgcn_raw_tbuffer_store:
9759 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9760 SDValue VData = Op.getOperand(2);
9761 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9762 if (IsD16)
9763 VData = handleD16VData(VData, DAG);
9764 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9765 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9766 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9767 SDValue Ops[] = {
9768 Chain,
9769 VData, // vdata
9770 Rsrc, // rsrc
9771 DAG.getConstant(0, DL, MVT::i32), // vindex
9772 VOffset, // voffset
9773 SOffset, // soffset
9774 Offset, // offset
9775 Op.getOperand(6), // format
9776 Op.getOperand(7), // cachepolicy, swizzled buffer
9777 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9778 };
9779 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9780 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9781 MemSDNode *M = cast<MemSDNode>(Op);
9782 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9783 M->getMemoryVT(), M->getMemOperand());
9784 }
9785
9786 case Intrinsic::amdgcn_raw_buffer_store:
9787 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9788 case Intrinsic::amdgcn_raw_buffer_store_format:
9789 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9790 const bool IsFormat =
9791 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9792 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9793
9794 SDValue VData = Op.getOperand(2);
9795 EVT VDataVT = VData.getValueType();
9796 EVT EltType = VDataVT.getScalarType();
9797 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9798 if (IsD16) {
9799 VData = handleD16VData(VData, DAG);
9800 VDataVT = VData.getValueType();
9801 }
9802
9803 if (!isTypeLegal(VDataVT)) {
9804 VData =
9805 DAG.getNode(ISD::BITCAST, DL,
9806 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9807 }
9808
9809 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9810 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9811 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9812 SDValue Ops[] = {
9813 Chain,
9814 VData,
9815 Rsrc,
9816 DAG.getConstant(0, DL, MVT::i32), // vindex
9817 VOffset, // voffset
9818 SOffset, // soffset
9819 Offset, // offset
9820 Op.getOperand(6), // cachepolicy, swizzled buffer
9821 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9822 };
9823 unsigned Opc =
9824 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9825 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9826 MemSDNode *M = cast<MemSDNode>(Op);
9827
9828 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9829 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9830 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9831
9832 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9833 M->getMemoryVT(), M->getMemOperand());
9834 }
9835
9836 case Intrinsic::amdgcn_struct_buffer_store:
9837 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9838 case Intrinsic::amdgcn_struct_buffer_store_format:
9839 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9840 const bool IsFormat =
9841 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9842 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9843
9844 SDValue VData = Op.getOperand(2);
9845 EVT VDataVT = VData.getValueType();
9846 EVT EltType = VDataVT.getScalarType();
9847 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9848
9849 if (IsD16) {
9850 VData = handleD16VData(VData, DAG);
9851 VDataVT = VData.getValueType();
9852 }
9853
9854 if (!isTypeLegal(VDataVT)) {
9855 VData =
9856 DAG.getNode(ISD::BITCAST, DL,
9857 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9858 }
9859
9860 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9861 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9862 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9863 SDValue Ops[] = {
9864 Chain,
9865 VData,
9866 Rsrc,
9867 Op.getOperand(4), // vindex
9868 VOffset, // voffset
9869 SOffset, // soffset
9870 Offset, // offset
9871 Op.getOperand(7), // cachepolicy, swizzled buffer
9872 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9873 };
9874 unsigned Opc =
9875 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9876 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9877 MemSDNode *M = cast<MemSDNode>(Op);
9878
9879 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9880 EVT VDataType = VData.getValueType().getScalarType();
9881 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9882 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9883
9884 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9885 M->getMemoryVT(), M->getMemOperand());
9886 }
9887 case Intrinsic::amdgcn_raw_buffer_load_lds:
9888 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9889 case Intrinsic::amdgcn_struct_buffer_load_lds:
9890 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9891 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9892 unsigned Opc;
9893 bool HasVIndex =
9894 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9895 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9896 unsigned OpOffset = HasVIndex ? 1 : 0;
9897 SDValue VOffset = Op.getOperand(5 + OpOffset);
9898 bool HasVOffset = !isNullConstant(VOffset);
9899 unsigned Size = Op->getConstantOperandVal(4);
9900
9901 switch (Size) {
9902 default:
9903 return SDValue();
9904 case 1:
9905 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9906 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9907 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9908 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9909 break;
9910 case 2:
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9915 break;
9916 case 4:
9917 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9918 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9919 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9920 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9921 break;
9922 case 12:
9923 if (!Subtarget->hasLDSLoadB96_B128())
9924 return SDValue();
9925 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9926 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9927 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9928 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9929 break;
9930 case 16:
9931 if (!Subtarget->hasLDSLoadB96_B128())
9932 return SDValue();
9933 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9934 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9935 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9936 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9937 break;
9938 }
9939
9940 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9941
9942 SmallVector<SDValue, 8> Ops;
9943 
9944 if (HasVIndex && HasVOffset)
9945 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9946 {Op.getOperand(5), // VIndex
9947 VOffset}));
9948 else if (HasVIndex)
9949 Ops.push_back(Op.getOperand(5));
9950 else if (HasVOffset)
9951 Ops.push_back(VOffset);
9952
9953 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9954 Ops.push_back(Rsrc);
9955 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9956 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9957 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9958 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9959 Ops.push_back(DAG.getTargetConstant(
9960 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9961 DL, MVT::i8)); // cpol
9962 Ops.push_back(DAG.getTargetConstant(
9963 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9964 ? 1
9965 : 0,
9966 DL, MVT::i8)); // swz
9967 Ops.push_back(M0Val.getValue(0)); // Chain
9968 Ops.push_back(M0Val.getValue(1)); // Glue
9969
9970 auto *M = cast<MemSDNode>(Op);
9971 MachineMemOperand *LoadMMO = M->getMemOperand();
9972 // Don't set the offset value here because the pointer points to the base of
9973 // the buffer.
9974 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9975
9976 MachinePointerInfo StorePtrI = LoadPtrI;
9977 LoadPtrI.V = PoisonValue::get(
9981
9982 auto F = LoadMMO->getFlags() &
9983 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9984 LoadMMO =
9985 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9986 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9987
9988 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9989 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9990 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9991
9992 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9993 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9994
9995 return SDValue(Load, 0);
9996 }
9997 case Intrinsic::amdgcn_global_load_lds: {
9998 unsigned Opc;
9999 unsigned Size = Op->getConstantOperandVal(4);
10000 switch (Size) {
10001 default:
10002 return SDValue();
10003 case 1:
10004 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10005 break;
10006 case 2:
10007 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10008 break;
10009 case 4:
10010 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10011 break;
10012 case 12:
10013 if (!Subtarget->hasLDSLoadB96_B128())
10014 return SDValue();
10015 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10016 break;
10017 case 16:
10018 if (!Subtarget->hasLDSLoadB96_B128())
10019 return SDValue();
10020 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10021 break;
10022 }
10023
10024 auto *M = cast<MemSDNode>(Op);
10025 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10026
10027 SmallVector<SDValue, 6> Ops;
10028 
10029 SDValue Addr = Op.getOperand(2); // Global ptr
10030 SDValue VOffset;
10031 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10032 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10033 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10034 SDValue LHS = Addr.getOperand(0);
10035 SDValue RHS = Addr.getOperand(1);
10036
10037 if (LHS->isDivergent())
10038 std::swap(LHS, RHS);
10039
10040 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10041 RHS.getOperand(0).getValueType() == MVT::i32) {
10042 // add (i64 sgpr), (zero_extend (i32 vgpr))
10043 Addr = LHS;
10044 VOffset = RHS.getOperand(0);
10045 }
10046 }
10047
10048 Ops.push_back(Addr);
10049 if (!Addr->isDivergent()) {
10050 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10051 if (!VOffset)
10052 VOffset =
10053 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10054 DAG.getTargetConstant(0, DL, MVT::i32)),
10055 0);
10056 Ops.push_back(VOffset);
10057 }
10058
10059 Ops.push_back(Op.getOperand(5)); // Offset
10060 Ops.push_back(Op.getOperand(6)); // CPol
10061 Ops.push_back(M0Val.getValue(0)); // Chain
10062 Ops.push_back(M0Val.getValue(1)); // Glue
10063
10064 MachineMemOperand *LoadMMO = M->getMemOperand();
10065 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10066 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10067 MachinePointerInfo StorePtrI = LoadPtrI;
10068 LoadPtrI.V = PoisonValue::get(
10069 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10070 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10071 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10072 auto F = LoadMMO->getFlags() &
10073 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10074 LoadMMO =
10075 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10076 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10077 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10078 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10079 LoadMMO->getAAInfo());
10080
10081 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10082 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10083
10084 return SDValue(Load, 0);
10085 }
10086 case Intrinsic::amdgcn_end_cf:
10087 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10088 Op->getOperand(2), Chain),
10089 0);
10090 case Intrinsic::amdgcn_s_barrier_init:
10091 case Intrinsic::amdgcn_s_barrier_signal_var: {
10092 // these two intrinsics have two operands: barrier pointer and member count
10093 SDValue Chain = Op->getOperand(0);
10094 SmallVector<SDValue, 2> Ops;
10095 SDValue BarOp = Op->getOperand(2);
10096 SDValue CntOp = Op->getOperand(3);
10097 SDValue M0Val;
10098 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10099 ? AMDGPU::S_BARRIER_INIT_M0
10100 : AMDGPU::S_BARRIER_SIGNAL_M0;
10101 // extract the BarrierID from bits 4-9 of BarOp
10102 SDValue BarID;
10103 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10104 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10105 BarID =
10106 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10107 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10108 0);
10109 // Member count should be put into M0[ShAmt:+6]
10110 // Barrier ID should be put into M0[5:0]
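// For example, a member count of 32 and a barrier ID of 3 produce
// M0 = (32 << 16) | 3 = 0x200003.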
10111 M0Val =
10112 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10113 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10114 0);
10115 constexpr unsigned ShAmt = 16;
10116 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10117 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10118
10119 M0Val = SDValue(
10120 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10121
10122 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10123
10124 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10125 return SDValue(NewMI, 0);
10126 }
10127 case Intrinsic::amdgcn_s_barrier_join: {
10128 // this intrinsic has one operand: the barrier pointer
10129 SDValue Chain = Op->getOperand(0);
10130 SmallVector<SDValue, 2> Ops;
10131 SDValue BarOp = Op->getOperand(2);
10132 unsigned Opc;
10133
10134 if (isa<ConstantSDNode>(BarOp)) {
10135 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10136 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10137
10138 // extract the BarrierID from bits 4-9 of the immediate
10139 unsigned BarID = (BarVal >> 4) & 0x3F;
10140 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10141 Ops.push_back(K);
10142 Ops.push_back(Chain);
10143 } else {
10144 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10145
10146 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10147 SDValue M0Val;
10148 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10149 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10150 M0Val =
10151 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10152 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10153 0);
10154 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10155 }
10156
10157 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10158 return SDValue(NewMI, 0);
10159 }
10160 case Intrinsic::amdgcn_s_prefetch_data: {
10161 // For non-global address space preserve the chain and remove the call.
10162 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10163 return Op.getOperand(0);
10164 return Op;
10165 }
10166 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10167 SDValue Ops[] = {
10168 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10169 Op.getOperand(3), // offset
10170 Op.getOperand(4), // length
10171 };
10172
10173 MemSDNode *M = cast<MemSDNode>(Op);
10174 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10175 Op->getVTList(), Ops, M->getMemoryVT(),
10176 M->getMemOperand());
10177 }
10178 default: {
10179 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10180 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10181 return lowerImage(Op, ImageDimIntr, DAG, true);
10182
10183 return Op;
10184 }
10185 }
10186}
10187
10188// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10189// offset (the offset that is included in bounds checking and swizzling, to be
10190// split between the instruction's voffset and immoffset fields) and soffset
10191// (the offset that is excluded from bounds checking and swizzling, to go in
10192// the instruction's soffset field). This function takes the first kind of
10193// offset and figures out how to split it between voffset and immoffset.
10194std::pair<SDValue, SDValue>
10195SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10196 SDLoc DL(Offset);
10197 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10198 SDValue N0 = Offset;
10199 ConstantSDNode *C1 = nullptr;
10200
10201 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10202 N0 = SDValue();
10203 else if (DAG.isBaseWithConstantOffset(N0)) {
10204 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10205 N0 = N0.getOperand(0);
10206 }
10207
10208 if (C1) {
10209 unsigned ImmOffset = C1->getZExtValue();
10210 // If the immediate value is too big for the immoffset field, put only bits
10211 // that would normally fit in the immoffset field. The remaining value that
10212 // is copied/added for the voffset field is a large power of 2, and it
10213 // stands more chance of being CSEd with the copy/add for another similar
10214 // load/store.
10215 // However, do not do that rounding down if the part left for the vgpr would be
10216 // a negative number, as it appears to be illegal to have a negative offset in
10217 // the vgpr, even if adding the immediate offset makes it positive.
10218 unsigned Overflow = ImmOffset & ~MaxImm;
10219 ImmOffset -= Overflow;
10220 if ((int32_t)Overflow < 0) {
10221 Overflow += ImmOffset;
10222 ImmOffset = 0;
10223 }
10224 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10225 if (Overflow) {
10226 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10227 if (!N0)
10228 N0 = OverflowVal;
10229 else {
10230 SDValue Ops[] = {N0, OverflowVal};
10231 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10232 }
10233 }
10234 }
10235 if (!N0)
10236 N0 = DAG.getConstant(0, DL, MVT::i32);
10237 if (!C1)
10238 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10239 return {N0, SDValue(C1, 0)};
10240}
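// Worked example (illustrative, assuming MaxImm == 4095): a combined offset of
// 4607 (0x11FF) splits into Overflow = 4096 and ImmOffset = 511, so the caller
// receives an immoffset of 511 and a voffset of N0 + 4096. If the split would
// leave a negative overflow, the entire value is moved into the voffset instead.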
10241
10242// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10243// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10244// pointed to by Offsets.
10245void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10246 SelectionDAG &DAG, SDValue *Offsets,
10247 Align Alignment) const {
10248 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10249 SDLoc DL(CombinedOffset);
10250 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10251 uint32_t Imm = C->getZExtValue();
10252 uint32_t SOffset, ImmOffset;
10253 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10254 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10255 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10256 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10257 return;
10258 }
10259 }
10260 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10261 SDValue N0 = CombinedOffset.getOperand(0);
10262 SDValue N1 = CombinedOffset.getOperand(1);
10263 uint32_t SOffset, ImmOffset;
10264 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10265 if (Offset >= 0 &&
10266 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10267 Offsets[0] = N0;
10268 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10269 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10270 return;
10271 }
10272 }
10273
10274 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10275 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10276 : DAG.getConstant(0, DL, MVT::i32);
10277
10278 Offsets[0] = CombinedOffset;
10279 Offsets[1] = SOffsetZero;
10280 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10281}
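// If neither form above matches (no constant, or it does not split cleanly),
// the whole combined offset is left in voffset and soffset is tied to zero, or
// to SGPR_NULL on subtargets with a restricted soffset encoding.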
10282
10283SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10284 SelectionDAG &DAG) const {
10285 if (!MaybePointer.getValueType().isScalarInteger())
10286 return MaybePointer;
10287
10288 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10289 return Rsrc;
10290}
10291
10292// Wrap a global or flat pointer into a buffer intrinsic using the flags
10293// specified in the intrinsic.
10294SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10295 SelectionDAG &DAG) const {
10296 SDLoc Loc(Op);
10297
10298 SDValue Pointer = Op->getOperand(1);
10299 SDValue Stride = Op->getOperand(2);
10300 SDValue NumRecords = Op->getOperand(3);
10301 SDValue Flags = Op->getOperand(4);
10302
10303 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10304 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10305 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10306 std::optional<uint32_t> ConstStride = std::nullopt;
10307 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10308 ConstStride = ConstNode->getZExtValue();
10309
10310 SDValue NewHighHalf = Masked;
10311 if (!ConstStride || *ConstStride != 0) {
10312 SDValue ShiftedStride;
10313 if (ConstStride) {
10314 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10315 } else {
10316 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10317 ShiftedStride =
10318 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10319 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10320 }
10321 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10322 }
10323
10324 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10325 NewHighHalf, NumRecords, Flags);
10326 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10327 return RsrcPtr;
10328}
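// The resulting v4i32 descriptor, as assembled above, is: word0 = pointer[31:0],
// word1 = pointer[47:32] | (stride << 16), word2 = NumRecords, word3 = Flags.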
10329
10330// Handle 8 bit and 16 bit buffer loads
10331SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10332 EVT LoadVT, SDLoc DL,
10333 ArrayRef<SDValue> Ops,
10334 MachineMemOperand *MMO,
10335 bool IsTFE) const {
10336 EVT IntVT = LoadVT.changeTypeToInteger();
10337
10338 if (IsTFE) {
10339 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10340 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10341 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10342 MachineFunction &MF = DAG.getMachineFunction();
10343 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10344 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10345 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10346 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10347 DAG.getConstant(1, DL, MVT::i32));
10348 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10349 DAG.getConstant(0, DL, MVT::i32));
10350 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10351 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10352 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10353 }
10354
10355 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10356 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10357 : AMDGPUISD::BUFFER_LOAD_USHORT;
10358
10359 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10360 SDValue BufferLoad =
10361 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10362 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10363 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10364
10365 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10366}
10367
10368// Handle 8 bit and 16 bit buffer stores
10369SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10370 EVT VDataType, SDLoc DL,
10371 SDValue Ops[],
10372 MemSDNode *M) const {
10373 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10374 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10375
10376 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10377 Ops[1] = BufferStoreExt;
10378 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10379 : AMDGPUISD::BUFFER_STORE_SHORT;
10380 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10381 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10382 M->getMemOperand());
10383}
10384
10385 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10386 SDValue Op, const SDLoc &SL, EVT VT) {
10387 if (VT.bitsLT(Op.getValueType()))
10388 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10389
10390 switch (ExtType) {
10391 case ISD::SEXTLOAD:
10392 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10393 case ISD::ZEXTLOAD:
10394 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10395 case ISD::EXTLOAD:
10396 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10397 case ISD::NON_EXTLOAD:
10398 return Op;
10399 }
10400
10401 llvm_unreachable("invalid ext type");
10402}
10403
10404// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10405// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10406SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10407 DAGCombinerInfo &DCI) const {
10408 SelectionDAG &DAG = DCI.DAG;
10409 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10410 return SDValue();
10411
10412 // FIXME: Constant loads should all be marked invariant.
10413 unsigned AS = Ld->getAddressSpace();
10414 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10415 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10416 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10417 return SDValue();
10418
10419 // Don't do this early, since it may interfere with adjacent load merging for
10420 // illegal types. We can avoid losing alignment information for exotic types
10421 // pre-legalize.
10422 EVT MemVT = Ld->getMemoryVT();
10423 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10424 MemVT.getSizeInBits() >= 32)
10425 return SDValue();
10426
10427 SDLoc SL(Ld);
10428
10429 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10430 "unexpected vector extload");
10431
10432 // TODO: Drop only high part of range.
10433 SDValue Ptr = Ld->getBasePtr();
10434 SDValue NewLoad = DAG.getLoad(
10435 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10436 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10437 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10438 nullptr); // Drop ranges
10439
10440 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10441 if (MemVT.isFloatingPoint()) {
10443 "unexpected fp extload");
10444 TruncVT = MemVT.changeTypeToInteger();
10445 }
10446
10447 SDValue Cvt = NewLoad;
10448 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10449 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10450 DAG.getValueType(TruncVT));
10451 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10452 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10453 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10454 } else {
10455 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10456 }
10457
10458 EVT VT = Ld->getValueType(0);
10459 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10460
10461 DCI.AddToWorklist(Cvt.getNode());
10462
10463 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10464 // the appropriate extension from the 32-bit load.
10465 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10466 DCI.AddToWorklist(Cvt.getNode());
10467
10468 // Handle conversion back to floating point if necessary.
10469 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10470
10471 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10472}
10473
10474 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10475 const SIMachineFunctionInfo &Info) {
10476 // TODO: Should check if the address can definitely not access stack.
10477 if (Info.isEntryFunction())
10478 return Info.getUserSGPRInfo().hasFlatScratchInit();
10479 return true;
10480}
10481
10482SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10483 SDLoc DL(Op);
10484 LoadSDNode *Load = cast<LoadSDNode>(Op);
10485 ISD::LoadExtType ExtType = Load->getExtensionType();
10486 EVT MemVT = Load->getMemoryVT();
10487 MachineMemOperand *MMO = Load->getMemOperand();
10488
10489 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10490 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10491 return SDValue();
10492
10493 // FIXME: Copied from PPC
10494 // First, load into 32 bits, then truncate to 1 bit.
10495
10496 SDValue Chain = Load->getChain();
10497 SDValue BasePtr = Load->getBasePtr();
10498
10499 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10500
10501 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10502 RealMemVT, MMO);
10503
10504 if (!MemVT.isVector()) {
10505 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10506 NewLD.getValue(1)};
10507
10508 return DAG.getMergeValues(Ops, DL);
10509 }
10510
10511 SmallVector<SDValue, 3> Elts;
10512 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10513 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10514 DAG.getConstant(I, DL, MVT::i32));
10515
10516 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10517 }
10518
10519 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10520
10521 return DAG.getMergeValues(Ops, DL);
10522 }
10523
10524 if (!MemVT.isVector())
10525 return SDValue();
10526
10527 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10528 "Custom lowering for non-i32 vectors hasn't been implemented.");
10529
10530 Align Alignment = Load->getAlign();
10531 unsigned AS = Load->getAddressSpace();
10532 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10533 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10534 return SplitVectorLoad(Op, DAG);
10535 }
10536
10537 MachineFunction &MF = DAG.getMachineFunction();
10538 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10539 // If there is a possibility that a flat instruction accesses scratch memory
10540 // then we need to use the same legalization rules we use for private.
10541 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10542 !Subtarget->hasMultiDwordFlatScratchAddressing())
10543 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10544 ? AMDGPUAS::PRIVATE_ADDRESS
10545 : AMDGPUAS::GLOBAL_ADDRESS;
10546
10547 unsigned NumElements = MemVT.getVectorNumElements();
10548
10549 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10550 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10551 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10552 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10553 isMemOpHasNoClobberedMemOperand(Load))) {
10554 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10555 Alignment >= Align(4) && NumElements < 32) {
10556 if (MemVT.isPow2VectorType() ||
10557 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10558 return SDValue();
10559 return WidenOrSplitVectorLoad(Op, DAG);
10560 }
10561 // Non-uniform loads will be selected to MUBUF instructions, so they
10562 // have the same legalization requirements as global and private
10563 // loads.
10564 //
10565 }
10566 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10567 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10568 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10569 if (NumElements > 4)
10570 return SplitVectorLoad(Op, DAG);
10571 // v3 loads not supported on SI.
10572 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10573 return WidenOrSplitVectorLoad(Op, DAG);
10574
10575 // v3 and v4 loads are supported for private and global memory.
10576 return SDValue();
10577 }
10578 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10579 // Depending on the setting of the private_element_size field in the
10580 // resource descriptor, we can only make private accesses up to a certain
10581 // size.
10582 switch (Subtarget->getMaxPrivateElementSize()) {
10583 case 4: {
10584 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10585 return DAG.getMergeValues({Op0, Op1}, DL);
10586 }
10587 case 8:
10588 if (NumElements > 2)
10589 return SplitVectorLoad(Op, DAG);
10590 return SDValue();
10591 case 16:
10592 // Same as global/flat
10593 if (NumElements > 4)
10594 return SplitVectorLoad(Op, DAG);
10595 // v3 loads not supported on SI.
10596 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10597 return WidenOrSplitVectorLoad(Op, DAG);
10598
10599 return SDValue();
10600 default:
10601 llvm_unreachable("unsupported private_element_size");
10602 }
10603 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10604 unsigned Fast = 0;
10605 auto Flags = Load->getMemOperand()->getFlags();
10606 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10607 Load->getAlign(), Flags, &Fast) &&
10608 Fast > 1)
10609 return SDValue();
10610
10611 if (MemVT.isVector())
10612 return SplitVectorLoad(Op, DAG);
10613 }
10614
10615 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10616 MemVT, *Load->getMemOperand())) {
10617 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10618 return DAG.getMergeValues({Op0, Op1}, DL);
10619 }
10620
10621 return SDValue();
10622}
10623
10624SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10625 EVT VT = Op.getValueType();
10626 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10627 VT.getSizeInBits() == 512)
10628 return splitTernaryVectorOp(Op, DAG);
10629
10630 assert(VT.getSizeInBits() == 64);
10631
10632 SDLoc DL(Op);
10633 SDValue Cond = Op.getOperand(0);
10634
10635 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10636 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10637
10638 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10639 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10640
10641 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10642 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10643
10644 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10645
10646 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10647 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10648
10649 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10650
10651 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10652 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10653}
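// The 64-bit select above is decomposed into two 32-bit selects on the low and
// high halves (via v2i32 bitcasts), which map naturally onto 32-bit conditional
// moves.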
10654
10655// Catch division cases where we can use shortcuts with rcp and rsq
10656// instructions.
10657SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10658 SelectionDAG &DAG) const {
10659 SDLoc SL(Op);
10660 SDValue LHS = Op.getOperand(0);
10661 SDValue RHS = Op.getOperand(1);
10662 EVT VT = Op.getValueType();
10663 const SDNodeFlags Flags = Op->getFlags();
10664
10665 bool AllowInaccurateRcp =
10666 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10667
10668 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10669 // Without !fpmath accuracy information, we can't do more because we don't
10670 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10671 // f16 is always accurate enough
10672 if (!AllowInaccurateRcp && VT != MVT::f16)
10673 return SDValue();
10674
10675 if (CLHS->isExactlyValue(1.0)) {
10676 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10677 // the CI documentation they have a worst case error of 1 ulp.
10678 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10679 // use it as long as we aren't trying to use denormals.
10680 //
10681 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10682
10683 // 1.0 / sqrt(x) -> rsq(x)
10684
10685 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10686 // error seems really high at 2^29 ULP.
10687 // 1.0 / x -> rcp(x)
10688 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10689 }
10690
10691 // Same as for 1.0, but expand the sign out of the constant.
10692 if (CLHS->isExactlyValue(-1.0)) {
10693 // -1.0 / x -> rcp (fneg x)
10694 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10695 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10696 }
10697 }
10698
10699 // For f16 require afn or arcp.
10700 // For f32 require afn.
10701 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10702 return SDValue();
10703
10704 // Turn into multiply by the reciprocal.
10705 // x / y -> x * (1.0 / y)
10706 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10707 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10708}
10709
10710SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10711 SelectionDAG &DAG) const {
10712 SDLoc SL(Op);
10713 SDValue X = Op.getOperand(0);
10714 SDValue Y = Op.getOperand(1);
10715 EVT VT = Op.getValueType();
10716 const SDNodeFlags Flags = Op->getFlags();
10717
10718 bool AllowInaccurateDiv =
10719 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10720 if (!AllowInaccurateDiv)
10721 return SDValue();
10722
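// The code below is a Newton-Raphson style refinement of rcp(Y): two fma-based
// correction steps sharpen R ~= 1/Y, and a final fma folds the residual of
// X * R back in to form the quotient.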
10723 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10724 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10725
10726 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10727 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10728
10729 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10730 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10731 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10732 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10733 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10734 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10735}
10736
10737static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10738 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10739 SDNodeFlags Flags) {
10740 if (GlueChain->getNumValues() <= 1) {
10741 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10742 }
10743
10744 assert(GlueChain->getNumValues() == 3);
10745
10746 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10747 switch (Opcode) {
10748 default:
10749 llvm_unreachable("no chain equivalent for opcode");
10750 case ISD::FMUL:
10751 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10752 break;
10753 }
10754
10755 return DAG.getNode(Opcode, SL, VTList,
10756 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10757 Flags);
10758}
10759
10760static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10761 EVT VT, SDValue A, SDValue B, SDValue C,
10762 SDValue GlueChain, SDNodeFlags Flags) {
10763 if (GlueChain->getNumValues() <= 1) {
10764 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10765 }
10766
10767 assert(GlueChain->getNumValues() == 3);
10768
10769 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10770 switch (Opcode) {
10771 default:
10772 llvm_unreachable("no chain equivalent for opcode");
10773 case ISD::FMA:
10774 Opcode = AMDGPUISD::FMA_W_CHAIN;
10775 break;
10776 }
10777
10778 return DAG.getNode(Opcode, SL, VTList,
10779 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10780 Flags);
10781}
10782
10783SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10784 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10785 return FastLowered;
10786
10787 SDLoc SL(Op);
10788 SDValue LHS = Op.getOperand(0);
10789 SDValue RHS = Op.getOperand(1);
10790
10791 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10792 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10793 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10794 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10795 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10796 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10797 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10798 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10799 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10800 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10801 // q16.u = opx(V_CVT_F16_F32, q32.u);
10802 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10803
10804 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10805 unsigned FMADOpCode =
10806 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10807
10808 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10809 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10810 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10811 SDValue Rcp =
10812 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10813 SDValue Quot =
10814 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10815 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10816 Op->getFlags());
10817 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10818 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10819 Op->getFlags());
10820 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10821 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10822 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10823 DAG.getConstant(0xff800000, SL, MVT::i32));
10824 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10825 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10826 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10827 DAG.getTargetConstant(0, SL, MVT::i32));
10828 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10829 Op->getFlags());
10830}
10831
10832// Faster 2.5 ULP division that does not support denormals.
10833SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10834 SDNodeFlags Flags = Op->getFlags();
10835 SDLoc SL(Op);
10836 SDValue LHS = Op.getOperand(1);
10837 SDValue RHS = Op.getOperand(2);
10838
10839 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10840
10841 const APFloat K0Val(0x1p+96f);
10842 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10843
10844 const APFloat K1Val(0x1p-32f);
10845 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10846
10847 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10848
10849 EVT SetCCVT =
10850 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10851
10852 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10853
10854 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10855
10856 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10857
10858 // rcp does not support denormals.
10859 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10860
10861 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10862
10863 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10864}
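// The sequence above pre-scales denominators with |RHS| > 2^+96 by 2^-32 so the
// reciprocal stays in range, then multiplies the same scale factor (r3) back
// into the final product to undo it.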
10865
10866// Returns immediate value for setting the F32 denorm mode when using the
10867// S_DENORM_MODE instruction.
10868 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10869 const SIMachineFunctionInfo *Info,
10870 const GCNSubtarget *ST) {
10871 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10872 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10873 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10874 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10875}
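// The packed immediate places the requested single-precision denorm mode in
// bits [1:0] and keeps the function's existing double/half denorm setting in
// bits [3:2], matching the S_DENORM_MODE operand layout.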
10876
10877SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10878 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10879 return FastLowered;
10880
10881 // The selection matcher assumes that anything with a chain selects to a
10882 // mayRaiseFPException machine instruction. Since we're introducing a chain
10883 // here, we need to explicitly report nofpexcept for the regular fdiv
10884 // lowering.
10885 SDNodeFlags Flags = Op->getFlags();
10886 Flags.setNoFPExcept(true);
10887
10888 SDLoc SL(Op);
10889 SDValue LHS = Op.getOperand(0);
10890 SDValue RHS = Op.getOperand(1);
10891
10892 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10893
10894 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10895
10896 SDValue DenominatorScaled =
10897 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10898 SDValue NumeratorScaled =
10899 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10900
10901 // Denominator is scaled to not be denormal, so using rcp is ok.
10902 SDValue ApproxRcp =
10903 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10904 SDValue NegDivScale0 =
10905 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10906
10907 using namespace AMDGPU::Hwreg;
10908 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10909 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10910
10911 const MachineFunction &MF = DAG.getMachineFunction();
10912 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10913 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10914
10915 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10916 const bool HasDynamicDenormals =
10917 (DenormMode.Input == DenormalMode::Dynamic) ||
10918 (DenormMode.Output == DenormalMode::Dynamic);
10919
10920 SDValue SavedDenormMode;
10921
10922 if (!PreservesDenormals) {
10923 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10924 // lowering. The chain dependence is insufficient, and we need glue. We do
10925 // not need the glue variants in a strictfp function.
10926
10927 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10928
10929 SDValue Glue = DAG.getEntryNode();
10930 if (HasDynamicDenormals) {
10931 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10932 DAG.getVTList(MVT::i32, MVT::Glue),
10933 {BitField, Glue});
10934 SavedDenormMode = SDValue(GetReg, 0);
10935
10936 Glue = DAG.getMergeValues(
10937 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10938 }
10939
10940 SDNode *EnableDenorm;
10941 if (Subtarget->hasDenormModeInst()) {
10942 const SDValue EnableDenormValue =
10943 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10944
10945 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10946 EnableDenormValue)
10947 .getNode();
10948 } else {
10949 const SDValue EnableDenormValue =
10950 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10951 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10952 {EnableDenormValue, BitField, Glue});
10953 }
10954
10955 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10956 SDValue(EnableDenorm, 1)};
10957
10958 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10959 }
10960
10961 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10962 ApproxRcp, One, NegDivScale0, Flags);
10963
10964 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10965 ApproxRcp, Fma0, Flags);
10966
10967 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10968 Fma1, Flags);
10969
10970 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10971 NumeratorScaled, Mul, Flags);
10972
10973 SDValue Fma3 =
10974 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10975
10976 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10977 NumeratorScaled, Fma3, Flags);
10978
10979 if (!PreservesDenormals) {
10980 SDNode *DisableDenorm;
10981 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10982 const SDValue DisableDenormValue = getSPDenormModeValue(
10983 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10984
10985 DisableDenorm =
10986 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10987 DisableDenormValue, Fma4.getValue(2))
10988 .getNode();
10989 } else {
10990 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10991 const SDValue DisableDenormValue =
10992 HasDynamicDenormals
10993 ? SavedDenormMode
10994 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10995
10996 DisableDenorm = DAG.getMachineNode(
10997 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10998 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10999 }
11000
11001 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11002 SDValue(DisableDenorm, 0), DAG.getRoot());
11003 DAG.setRoot(OutputChain);
11004 }
11005
11006 SDValue Scale = NumeratorScaled.getValue(1);
11007 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11008 {Fma4, Fma1, Fma3, Scale}, Flags);
11009
11010 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11011}
11012
11013SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11014 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11015 return FastLowered;
11016
11017 SDLoc SL(Op);
11018 SDValue X = Op.getOperand(0);
11019 SDValue Y = Op.getOperand(1);
11020
11021 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11022
11023 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11024
11025 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11026
11027 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11028
11029 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11030
11031 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11032
11033 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11034
11035 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11036
11037 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11038
11039 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11040 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11041
11042 SDValue Fma4 =
11043 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11044
11045 SDValue Scale;
11046
11047 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11048 // Workaround a hardware bug on SI where the condition output from div_scale
11049 // is not usable.
11050
11051 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11052
11053 // Figure out which scale to use for div_fmas.
11054 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11055 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11056 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11057 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11058
11059 SDValue NumHi =
11060 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11061 SDValue DenHi =
11062 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11063
11064 SDValue Scale0Hi =
11065 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11066 SDValue Scale1Hi =
11067 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11068
11069 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11070 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11071 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11072 } else {
11073 Scale = DivScale1.getValue(1);
11074 }
11075
11076 SDValue Fmas =
11077 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11078
11079 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11080}
11081
11082SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11083 EVT VT = Op.getValueType();
11084
11085 if (VT == MVT::f32)
11086 return LowerFDIV32(Op, DAG);
11087
11088 if (VT == MVT::f64)
11089 return LowerFDIV64(Op, DAG);
11090
11091 if (VT == MVT::f16)
11092 return LowerFDIV16(Op, DAG);
11093
11094 llvm_unreachable("Unexpected type for fdiv");
11095}
11096
11097SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11098 SDLoc dl(Op);
11099 SDValue Val = Op.getOperand(0);
11100 EVT VT = Val.getValueType();
11101 EVT ResultExpVT = Op->getValueType(1);
11102 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11103
11104 SDValue Mant = DAG.getNode(
11105 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11106 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11107
11108 SDValue Exp = DAG.getNode(
11109 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11110 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11111
11112 if (Subtarget->hasFractBug()) {
11113 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11114 SDValue Inf =
11115 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11116
11117 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11118 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11119 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11120 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11121 }
11122
11123 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11124 return DAG.getMergeValues({Mant, CastExp}, dl);
11125}
11126
11127SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11128 SDLoc DL(Op);
11129 StoreSDNode *Store = cast<StoreSDNode>(Op);
11130 EVT VT = Store->getMemoryVT();
11131
11132 if (VT == MVT::i1) {
11133 return DAG.getTruncStore(
11134 Store->getChain(), DL,
11135 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11136 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11137 }
11138
11139 assert(VT.isVector() &&
11140 Store->getValue().getValueType().getScalarType() == MVT::i32);
11141
11142 unsigned AS = Store->getAddressSpace();
11143 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11144 Store->getAlign().value() < VT.getStoreSize() &&
11145 VT.getSizeInBits() > 32) {
11146 return SplitVectorStore(Op, DAG);
11147 }
11148
11149 MachineFunction &MF = DAG.getMachineFunction();
11150 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11151 // If there is a possibility that a flat instruction accesses scratch memory
11152 // then we need to use the same legalization rules we use for private.
11153 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11154 !Subtarget->hasMultiDwordFlatScratchAddressing())
11155 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11156 ? AMDGPUAS::PRIVATE_ADDRESS
11157 : AMDGPUAS::GLOBAL_ADDRESS;
11158
11159 unsigned NumElements = VT.getVectorNumElements();
11160 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11161 if (NumElements > 4)
11162 return SplitVectorStore(Op, DAG);
11163 // v3 stores not supported on SI.
11164 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11165 return SplitVectorStore(Op, DAG);
11166
11167 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11168 VT, *Store->getMemOperand()))
11169 return expandUnalignedStore(Store, DAG);
11170
11171 return SDValue();
11172 }
11173 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11174 switch (Subtarget->getMaxPrivateElementSize()) {
11175 case 4:
11176 return scalarizeVectorStore(Store, DAG);
11177 case 8:
11178 if (NumElements > 2)
11179 return SplitVectorStore(Op, DAG);
11180 return SDValue();
11181 case 16:
11182 if (NumElements > 4 ||
11183 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11184 return SplitVectorStore(Op, DAG);
11185 return SDValue();
11186 default:
11187 llvm_unreachable("unsupported private_element_size");
11188 }
11189 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11190 unsigned Fast = 0;
11191 auto Flags = Store->getMemOperand()->getFlags();
11192 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11193 Store->getAlign(), Flags, &Fast) &&
11194 Fast > 1)
11195 return SDValue();
11196
11197 if (VT.isVector())
11198 return SplitVectorStore(Op, DAG);
11199
11200 return expandUnalignedStore(Store, DAG);
11201 }
11202
11203 // Probably an invalid store. If so we'll end up emitting a selection error.
11204 return SDValue();
11205}
11206
11207// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11208SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11209 SDLoc SL(Op);
11210 assert(!Subtarget->has16BitInsts());
11211 SDNodeFlags Flags = Op->getFlags();
11212 SDValue Ext =
11213 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11214
11215 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11216 SDValue Sqrt =
11217 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11218
11219 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11220 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11221}
11222
11223SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11224 SDLoc DL(Op);
11225 SDNodeFlags Flags = Op->getFlags();
11226 MVT VT = Op.getValueType().getSimpleVT();
11227 const SDValue X = Op.getOperand(0);
11228
11229 if (allowApproxFunc(DAG, Flags)) {
11230 // Instruction is 1ulp but ignores denormals.
11231 return DAG.getNode(
11232 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11233 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11234 }
11235
11236 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11237 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11238
11239 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11240
11241 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11242
11243 SDValue SqrtX =
11244 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11245
11246 SDValue SqrtS;
11247 if (needsDenormHandlingF32(DAG, X, Flags)) {
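// Denormal-correct path: query the hardware sqrt for an initial s, then use the
// residuals x - nextdown(s)*s and x - nextup(s)*s (computed with fma below) to
// decide whether s should be nudged one ulp down or up.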
11248 SDValue SqrtID =
11249 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11250 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11251
11252 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11253 SDValue SqrtSNextDownInt =
11254 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11255 DAG.getAllOnesConstant(DL, MVT::i32));
11256 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11257
11258 SDValue NegSqrtSNextDown =
11259 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11260
11261 SDValue SqrtVP =
11262 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11263
11264 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11265 DAG.getConstant(1, DL, MVT::i32));
11266 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11267
11268 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11269 SDValue SqrtVS =
11270 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11271
11272 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11273 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11274
11275 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11276 Flags);
11277
11278 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11279 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11280 Flags);
11281 } else {
11282 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11283
11284 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11285
11286 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11287 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11288 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11289
11290 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11291 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11292 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11293
11294 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11295 SDValue SqrtD =
11296 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11297 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11298 }
11299
11300 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11301
11302 SDValue ScaledDown =
11303 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11304
11305 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11306 SDValue IsZeroOrInf =
11307 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11308 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11309
11310 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11311}
11312
11313SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11314 // For double type, the SQRT and RSQ instructions don't have required
11315 // precision, we apply Goldschmidt's algorithm to improve the result:
11316 //
11317 // y0 = rsq(x)
11318 // g0 = x * y0
11319 // h0 = 0.5 * y0
11320 //
11321 // r0 = 0.5 - h0 * g0
11322 // g1 = g0 * r0 + g0
11323 // h1 = h0 * r0 + h0
11324 //
11325 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11326 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11327 // h2 = h1 * r1 + h1
11328 //
11329 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11330 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11331 //
11332 // sqrt(x) = g3
11333
11334 SDNodeFlags Flags = Op->getFlags();
11335
11336 SDLoc DL(Op);
11337
11338 SDValue X = Op.getOperand(0);
11339 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11340
11341 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11342
11343 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11344
11345 // Scale up input if it is too small.
11346 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11347 SDValue ScaleUp =
11348 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11349 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11350
11351 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11352
11353 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11354
11355 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11356 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11357
11358 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11359 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11360
11361 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11362
11363 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11364
11365 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11366 SDValue SqrtD0 =
11367 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11368
11369 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11370
11371 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11372 SDValue SqrtD1 =
11373 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11374
11375 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11376
11377 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11378 SDValue ScaleDown =
11379 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11380 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11381
11382 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11383 // with finite only or nsz because rsq(+/-0) = +/-inf
11384
11385 // TODO: Check for DAZ and expand to subnormals
11386 SDValue IsZeroOrInf =
11387 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11388 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11389
11390 // If x is +INF, +0, or -0, use its original value
11391 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11392 Flags);
11393}
11394
11395SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11396 SDLoc DL(Op);
11397 EVT VT = Op.getValueType();
11398 SDValue Arg = Op.getOperand(0);
11399 SDValue TrigVal;
11400
11401 // Propagate fast-math flags so that the multiply we introduce can be folded
11402 // if Arg is already the result of a multiply by constant.
11403 auto Flags = Op->getFlags();
11404
11405 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11406
11407 if (Subtarget->hasTrigReducedRange()) {
11408 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11409 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11410 } else {
11411 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11412 }
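// TrigVal now holds the angle scaled by 1/(2*pi) (i.e. in turns), which is the
// input convention of the COS_HW/SIN_HW nodes; FRACT additionally reduces it
// into [0, 1) on subtargets with a limited trig input range.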
11413
11414 switch (Op.getOpcode()) {
11415 case ISD::FCOS:
11416 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11417 case ISD::FSIN:
11418 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11419 default:
11420 llvm_unreachable("Wrong trig opcode");
11421 }
11422}
11423
11424SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11425 SelectionDAG &DAG) const {
11426 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11427 assert(AtomicNode->isCompareAndSwap());
11428 unsigned AS = AtomicNode->getAddressSpace();
11429
11430 // No custom lowering required for local address space
11431 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11432 return Op;
11433
11434 // Non-local address space requires custom lowering for atomic compare
11435 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11436 SDLoc DL(Op);
11437 SDValue ChainIn = Op.getOperand(0);
11438 SDValue Addr = Op.getOperand(1);
11439 SDValue Old = Op.getOperand(2);
11440 SDValue New = Op.getOperand(3);
11441 EVT VT = Op.getValueType();
11442 MVT SimpleVT = VT.getSimpleVT();
11443 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11444
11445 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11446 SDValue Ops[] = {ChainIn, Addr, NewOld};
11447
11448 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11449 Op->getVTList(), Ops, VT,
11450 AtomicNode->getMemOperand());
11451}
11452
11453//===----------------------------------------------------------------------===//
11454// Custom DAG optimizations
11455//===----------------------------------------------------------------------===//
11456
11457SDValue
11458SITargetLowering::performUCharToFloatCombine(SDNode *N,
11459 DAGCombinerInfo &DCI) const {
11460 EVT VT = N->getValueType(0);
11461 EVT ScalarVT = VT.getScalarType();
11462 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11463 return SDValue();
11464
11465 SelectionDAG &DAG = DCI.DAG;
11466 SDLoc DL(N);
11467
11468 SDValue Src = N->getOperand(0);
11469 EVT SrcVT = Src.getValueType();
11470
11471 // TODO: We could try to match extracting the higher bytes, which would be
11472 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11473 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11474 // about in practice.
11475 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11476 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11477 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11478 DCI.AddToWorklist(Cvt.getNode());
11479
11480 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11481 if (ScalarVT != MVT::f32) {
11482 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11483 DAG.getTargetConstant(0, DL, MVT::i32));
11484 }
11485 return Cvt;
11486 }
11487 }
11488
11489 return SDValue();
11490}
11491
11492SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11493 DAGCombinerInfo &DCI) const {
11494 SDValue MagnitudeOp = N->getOperand(0);
11495 SDValue SignOp = N->getOperand(1);
11496 SelectionDAG &DAG = DCI.DAG;
11497 SDLoc DL(N);
11498
11499 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11500 // lower half with a copy.
11501 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11502 if (MagnitudeOp.getValueType() == MVT::f64) {
11503 SDValue MagAsVector =
11504 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11505 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11506 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11507 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11508 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11509
11510 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11511
11512 SDValue Vector =
11513 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11514
11515 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11516 }
11517
11518 if (SignOp.getValueType() != MVT::f64)
11519 return SDValue();
11520
11521 // Reduce width of sign operand, we only need the highest bit.
11522 //
11523 // fcopysign f64:x, f64:y ->
11524 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11525 // TODO: In some cases it might make sense to go all the way to f16.
11526 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11527 SDValue SignAsF32 =
11528 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11529 DAG.getConstant(1, DL, MVT::i32));
11530
11531 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11532 SignAsF32);
11533}
11534
11535// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11536// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11537// bits
11538
11539// This is a variant of
11540// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11541//
11542 // The normal DAG combiner will do this, but only if the add has one use,
11543 // since otherwise it would increase the number of instructions.
11544//
11545// This prevents us from seeing a constant offset that can be folded into a
11546// memory instruction's addressing mode. If we know the resulting add offset of
11547// a pointer can be folded into an addressing offset, we can replace the pointer
11548// operand with the add of new constant offset. This eliminates one of the uses,
11549// and may allow the remaining use to also be simplified.
11550//
11551SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11552 EVT MemVT,
11553 DAGCombinerInfo &DCI) const {
11554 SDValue N0 = N->getOperand(0);
11555 SDValue N1 = N->getOperand(1);
11556
11557 // We only do this to handle cases where it's profitable when there are
11558 // multiple uses of the add, so defer to the standard combine.
11559 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11560 N0->hasOneUse())
11561 return SDValue();
11562
11563 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11564 if (!CN1)
11565 return SDValue();
11566
11567 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11568 if (!CAdd)
11569 return SDValue();
11570
11571 SelectionDAG &DAG = DCI.DAG;
11572
11573 if (N0->getOpcode() == ISD::OR &&
11574 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11575 return SDValue();
11576
11577 // If the resulting offset is too large, we can't fold it into the
11578 // addressing mode offset.
11579 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11580 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11581
11582 AddrMode AM;
11583 AM.HasBaseReg = true;
11584 AM.BaseOffs = Offset.getSExtValue();
11585 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11586 return SDValue();
11587
11588 SDLoc SL(N);
11589 EVT VT = N->getValueType(0);
11590
11591 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11592 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11593
11594 SDNodeFlags Flags;
11595 Flags.setNoUnsignedWrap(
11596 N->getFlags().hasNoUnsignedWrap() &&
11597 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11598
11599 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11600}
11601
11602 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer sits
11603 /// after the chain and intrinsic ID operands. Theoretically we would also need to
11604 /// check the specific intrinsic, but they all place the pointer operand first.
11605static unsigned getBasePtrIndex(const MemSDNode *N) {
11606 switch (N->getOpcode()) {
11607 case ISD::STORE:
11608 case ISD::INTRINSIC_W_CHAIN:
11609 case ISD::INTRINSIC_VOID:
11610 return 2;
11611 default:
11612 return 1;
11613 }
11614}
11615
11616SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11617 DAGCombinerInfo &DCI) const {
11618 SelectionDAG &DAG = DCI.DAG;
11619 SDLoc SL(N);
11620
11621 unsigned PtrIdx = getBasePtrIndex(N);
11622 SDValue Ptr = N->getOperand(PtrIdx);
11623
11624 // TODO: We could also do this for multiplies.
11625 if (Ptr.getOpcode() == ISD::SHL) {
11626 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11627 N->getMemoryVT(), DCI);
11628 if (NewPtr) {
11629 SmallVector<SDValue, 8> NewOps(N->ops());
11630
11631 NewOps[PtrIdx] = NewPtr;
11632 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11633 }
11634 }
11635
11636 return SDValue();
11637}
11638
11639static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11640 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11641 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11642 (Opc == ISD::XOR && Val == 0);
11643}
11644
11645// Break up a 64-bit bitwise operation with a constant into two 32-bit
11646// and/or/xor operations. This will typically happen anyway for a VALU 64-bit
11647// and, and it exposes other 32-bit integer combine opportunities since most
11648// 64-bit operations are decomposed this way. TODO: We won't want this for SALU,
11649// especially if it is an inline immediate.
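// For example (constant chosen for illustration):
//   (and i64:x, 0xffffffff00000000)
// splits into (and lo_32(x), 0x0), which folds to 0, and
// (and hi_32(x), 0xffffffff), which folds to hi_32(x), leaving only the
// rebuild of the 64-bit result from the two 32-bit halves.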
11650SDValue SITargetLowering::splitBinaryBitConstantOp(
11651 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11652 const ConstantSDNode *CRHS) const {
11653 uint64_t Val = CRHS->getZExtValue();
11654 uint32_t ValLo = Lo_32(Val);
11655 uint32_t ValHi = Hi_32(Val);
11656 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11657
11658 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11659 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11660 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11661 // If we need to materialize a 64-bit immediate, it will be split up later
11662 // anyway. Avoid creating the harder to understand 64-bit immediate
11663 // materialization.
11664 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11665 }
11666
11667 return SDValue();
11668}
11669
11670bool llvm::isBoolSGPR(SDValue V) {
11671 if (V.getValueType() != MVT::i1)
11672 return false;
11673 switch (V.getOpcode()) {
11674 default:
11675 break;
11676 case ISD::SETCC:
11677 case AMDGPUISD::FP_CLASS:
11678 return true;
11679 case ISD::AND:
11680 case ISD::OR:
11681 case ISD::XOR:
11682 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11683 }
11684 return false;
11685}
11686
11687// If a constant has all zeroes or all ones within each byte return it.
11688// Otherwise return 0.
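// For example, 0x00ff00ff is returned unchanged (every byte is fully 0x00 or
// 0xff), while 0x00ff0f00 returns 0 because byte 1 is only partially selected.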
11689static uint32_t getConstantPermuteMask(uint32_t C) {
11690 // 0xff for any zero byte in the mask
11691 uint32_t ZeroByteMask = 0;
11692 if (!(C & 0x000000ff))
11693 ZeroByteMask |= 0x000000ff;
11694 if (!(C & 0x0000ff00))
11695 ZeroByteMask |= 0x0000ff00;
11696 if (!(C & 0x00ff0000))
11697 ZeroByteMask |= 0x00ff0000;
11698 if (!(C & 0xff000000))
11699 ZeroByteMask |= 0xff000000;
11700 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11701 if ((NonZeroByteMask & C) != NonZeroByteMask)
11702 return 0; // Partial bytes selected.
11703 return C;
11704}
11705
11706// Check if a node selects whole bytes from its operand 0 starting at a byte
11707// boundary while masking the rest. Returns the select mask as used by
11708// v_perm_b32, or ~0 if the match did not succeed.
11709// Note byte select encoding:
11710// value 0-3 selects corresponding source byte;
11711// value 0xc selects zero;
11712// value 0xff selects 0xff.
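// A few worked examples of the returned mask (operands are illustrative):
//   (and x, 0x0000ffff) -> 0x0c0c0100 (bytes 0-1 from x, bytes 2-3 zero)
//   (shl x, 16)         -> 0x01000c0c (bytes 0-1 zero, bytes 2-3 from x's low half)
//   (srl x, 8)          -> 0x0c030201 (bytes 1-3 of x shifted down, byte 3 zero)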
11713static uint32_t getPermuteMask(SDValue V) {
11714 assert(V.getValueSizeInBits() == 32);
11715
11716 if (V.getNumOperands() != 2)
11717 return ~0;
11718
11719 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11720 if (!N1)
11721 return ~0;
11722
11723 uint32_t C = N1->getZExtValue();
11724
11725 switch (V.getOpcode()) {
11726 default:
11727 break;
11728 case ISD::AND:
11729 if (uint32_t ConstMask = getConstantPermuteMask(C))
11730 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11731 break;
11732
11733 case ISD::OR:
11734 if (uint32_t ConstMask = getConstantPermuteMask(C))
11735 return (0x03020100 & ~ConstMask) | ConstMask;
11736 break;
11737
11738 case ISD::SHL:
11739 if (C % 8)
11740 return ~0;
11741
11742 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11743
11744 case ISD::SRL:
11745 if (C % 8)
11746 return ~0;
11747
11748 return uint32_t(0x0c0c0c0c03020100ull >> C);
11749 }
11750
11751 return ~0;
11752}
11753
11754SDValue SITargetLowering::performAndCombine(SDNode *N,
11755 DAGCombinerInfo &DCI) const {
11756 if (DCI.isBeforeLegalize())
11757 return SDValue();
11758
11759 SelectionDAG &DAG = DCI.DAG;
11760 EVT VT = N->getValueType(0);
11761 SDValue LHS = N->getOperand(0);
11762 SDValue RHS = N->getOperand(1);
11763
11764 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11765 if (VT == MVT::i64 && CRHS) {
11766 if (SDValue Split =
11767 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11768 return Split;
11769 }
11770
11771 if (CRHS && VT == MVT::i32) {
11772 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11773 // nb = number of trailing zeroes in mask
11774 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11775 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
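 // For example (illustrative constants, assuming an SDWA-capable subtarget):
 //   (and (srl x, 8), 0xff00) -> (shl (bfe_u32 x, 16, 8), 8)
 // since both select the same 8-bit field of x that starts at bit 16.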
11776 uint64_t Mask = CRHS->getZExtValue();
11777 unsigned Bits = llvm::popcount(Mask);
11778 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11779 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11780 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11781 unsigned Shift = CShift->getZExtValue();
11782 unsigned NB = CRHS->getAPIntValue().countr_zero();
11783 unsigned Offset = NB + Shift;
11784 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11785 SDLoc SL(N);
11786 SDValue BFE =
11787 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11788 DAG.getConstant(Offset, SL, MVT::i32),
11789 DAG.getConstant(Bits, SL, MVT::i32));
11790 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11791 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11792 DAG.getValueType(NarrowVT));
11793 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11794 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11795 return Shl;
11796 }
11797 }
11798 }
11799
11800 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11801 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11802 isa<ConstantSDNode>(LHS.getOperand(2))) {
11803 uint32_t Sel = getConstantPermuteMask(Mask);
11804 if (!Sel)
11805 return SDValue();
11806
11807 // Select 0xc for all zero bytes
11808 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11809 SDLoc DL(N);
11810 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11811 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11812 }
11813 }
11814
11815 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11816 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11817 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11818 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11819 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11820
11821 SDValue X = LHS.getOperand(0);
11822 SDValue Y = RHS.getOperand(0);
11823 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11824 !isTypeLegal(X.getValueType()))
11825 return SDValue();
11826
11827 if (LCC == ISD::SETO) {
11828 if (X != LHS.getOperand(1))
11829 return SDValue();
11830
11831 if (RCC == ISD::SETUNE) {
11832 const ConstantFPSDNode *C1 =
11833 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11834 if (!C1 || !C1->isInfinity() || C1->isNegative())
11835 return SDValue();
11836
11837 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11838 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11839 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11840 SIInstrFlags::P_NORMAL;
11841
11842 static_assert(
11843 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11844 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11845 0x3ff) == Mask,
11846 "mask not equal");
11847
11848 SDLoc DL(N);
11849 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11850 DAG.getConstant(Mask, DL, MVT::i32));
11851 }
11852 }
11853 }
11854
11855 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11856 std::swap(LHS, RHS);
11857
11858 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11859 RHS.hasOneUse()) {
11860 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11861 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11862 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11863 //
11864 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11865 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11866 (RHS.getOperand(0) == LHS.getOperand(0) &&
11867 LHS.getOperand(0) == LHS.getOperand(1))) {
11868 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11869 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11870 : Mask->getZExtValue() & OrdMask;
11871
11872 SDLoc DL(N);
11873 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11874 DAG.getConstant(NewMask, DL, MVT::i32));
11875 }
11876 }
11877
11878 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11879 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11880 // and x, (sext cc from i1) => select cc, x, 0
11881 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11882 std::swap(LHS, RHS);
11883 if (isBoolSGPR(RHS.getOperand(0)))
11884 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11885 DAG.getConstant(0, SDLoc(N), MVT::i32));
11886 }
11887
11888 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11890 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11891 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11892 uint32_t LHSMask = getPermuteMask(LHS);
11893 uint32_t RHSMask = getPermuteMask(RHS);
11894 if (LHSMask != ~0u && RHSMask != ~0u) {
11895 // Canonicalize the expression in an attempt to have fewer unique masks
11896 // and therefore fewer registers used to hold the masks.
11897 if (LHSMask > RHSMask) {
11898 std::swap(LHSMask, RHSMask);
11899 std::swap(LHS, RHS);
11900 }
11901
11902 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11903 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11904 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11905 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11906
11907 // Check if we need to combine values from two sources within a byte.
11908 if (!(LHSUsedLanes & RHSUsedLanes) &&
11909 // If we select high and lower word keep it for SDWA.
11910 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11911 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11912 // Each byte in each mask is either a selector value 0-3, or has higher
11913 // bits set: 0xff selects the constant 0xff and 0x0c selects zero. If
11914 // either mask has 0x0c for a byte, the result byte must be 0x0c; otherwise
11915 // the mask byte that is not 0xff wins. ANDing both masks gives the correct
11916 // result, except that zero bytes must be corrected back to exactly 0x0c.
11917 uint32_t Mask = LHSMask & RHSMask;
11918 for (unsigned I = 0; I < 32; I += 8) {
11919 uint32_t ByteSel = 0xff << I;
11920 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11921 Mask &= (0x0c << I) & 0xffffffff;
11922 }
11923
11924 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11925 // or 0x0c.
11926 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11927 SDLoc DL(N);
11928
11929 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11930 RHS.getOperand(0),
11931 DAG.getConstant(Sel, DL, MVT::i32));
11932 }
11933 }
11934 }
11935
11936 return SDValue();
11937}
11938
11939// A key component of v_perm is a mapping between the byte positions of the src
11940// operands and the byte positions of the dest. To build it, we need: 1. the
11941// node that provides byte x of the dest of the OR, and 2. the byte of that node
11942// used to provide it. calculateByteProvider finds which node provides a certain
11943// byte of the dest of the OR, and calculateSrcByte takes that node and finds
11944// the ultimate src and byte position. For example, the supported LoadCombine
11945// pattern for vector loads is as follows:
11946// t1
11947// or
11948// / \
11949// t2 t3
11950// zext shl
11951// | | \
11952// t4 t5 16
11953// or anyext
11954// / \ |
11955// t6 t7 t8
11956// srl shl or
11957// / | / \ / \
11958// t9 t10 t11 t12 t13 t14
11959// trunc* 8 trunc* 8 and and
11960// | | / | | \
11961// t15 t16 t17 t18 t19 t20
11962// trunc* 255 srl -256
11963// | / \
11964// t15 t15 16
11965//
11966// *In this example, the truncs are from i32->i16
11967//
11968// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11969// respectively. calculateSrcByte would find (given node) -> ultimate src &
11970// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11971// After finding the mapping, we can combine the tree into vperm t15, t16,
11972// 0x05000407
11973
11974// Find the source and byte position from a node.
11975// \p DestByte is the byte position of the dest of the or that the src
11976// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11977// dest of the or byte. \p Depth tracks how many recursive iterations we have
11978// performed.
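// For example, when walking through (srl t, 16) the SrcIndex is advanced by
// two, so a request that reaches the SRL with SrcIndex 0 ultimately resolves to
// byte 2 of t.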
11979static const std::optional<ByteProvider<SDValue>>
11980calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11981 unsigned Depth = 0) {
11982 // We may need to recursively traverse a series of SRLs
11983 if (Depth >= 6)
11984 return std::nullopt;
11985
11986 if (Op.getValueSizeInBits() < 8)
11987 return std::nullopt;
11988
11989 if (Op.getValueType().isVector())
11990 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11991
11992 switch (Op->getOpcode()) {
11993 case ISD::TRUNCATE: {
11994 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11995 }
11996
11997 case ISD::SIGN_EXTEND:
11998 case ISD::ZERO_EXTEND:
11999 case ISD::SIGN_EXTEND_INREG: {
12000 SDValue NarrowOp = Op->getOperand(0);
12001 auto NarrowVT = NarrowOp.getValueType();
12002 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12003 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12004 NarrowVT = VTSign->getVT();
12005 }
12006 if (!NarrowVT.isByteSized())
12007 return std::nullopt;
12008 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12009
12010 if (SrcIndex >= NarrowByteWidth)
12011 return std::nullopt;
12012 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12013 }
12014
12015 case ISD::SRA:
12016 case ISD::SRL: {
12017 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12018 if (!ShiftOp)
12019 return std::nullopt;
12020
12021 uint64_t BitShift = ShiftOp->getZExtValue();
12022
12023 if (BitShift % 8 != 0)
12024 return std::nullopt;
12025
12026 SrcIndex += BitShift / 8;
12027
12028 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12029 }
12030
12031 default: {
12032 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12033 }
12034 }
12035 llvm_unreachable("fully handled switch");
12036}
12037
12038// For a byte position in the result of an Or, traverse the tree and find the
12039// node (and the byte of the node) which ultimately provides this {Or,
12040// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12041// the byte position of the Op that corresponds with the originally requested
12042// byte of the Or. \p Depth tracks how many recursive iterations we have
12043// performed. \p StartingIndex is the originally requested byte of the Or.
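// For example (types chosen for illustration), for
//   (or i32:(shl a, 16), (zero_extend i16:b))
// bytes 0-1 of the result are provided by b and bytes 2-3 by bytes 0-1 of a.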
12044static const std::optional<ByteProvider<SDValue>>
12045calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12046 unsigned StartingIndex = 0) {
12047 // Finding the Src tree of the RHS of an OR typically requires at least one
12048 // additional level of depth.
12049 if (Depth > 6)
12050 return std::nullopt;
12051
12052 unsigned BitWidth = Op.getScalarValueSizeInBits();
12053 if (BitWidth % 8 != 0)
12054 return std::nullopt;
12055 if (Index > BitWidth / 8 - 1)
12056 return std::nullopt;
12057
12058 bool IsVec = Op.getValueType().isVector();
12059 switch (Op.getOpcode()) {
12060 case ISD::OR: {
12061 if (IsVec)
12062 return std::nullopt;
12063
12064 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12065 StartingIndex);
12066 if (!RHS)
12067 return std::nullopt;
12068 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12069 StartingIndex);
12070 if (!LHS)
12071 return std::nullopt;
12072 // A well formed Or will have two ByteProviders for each byte, one of which
12073 // is constant zero
12074 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12075 return std::nullopt;
12076 if (!LHS || LHS->isConstantZero())
12077 return RHS;
12078 if (!RHS || RHS->isConstantZero())
12079 return LHS;
12080 return std::nullopt;
12081 }
12082
12083 case ISD::AND: {
12084 if (IsVec)
12085 return std::nullopt;
12086
12087 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12088 if (!BitMaskOp)
12089 return std::nullopt;
12090
12091 uint32_t BitMask = BitMaskOp->getZExtValue();
12092 // Bits we expect for our StartingIndex
12093 uint32_t IndexMask = 0xFF << (Index * 8);
12094
12095 if ((IndexMask & BitMask) != IndexMask) {
12096 // If the result of the and partially provides the byte, then it
12097 // is not well formatted
12098 if (IndexMask & BitMask)
12099 return std::nullopt;
12100 return ByteProvider<SDValue>::getConstantZero();
12101 }
12102
12103 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12104 }
12105
12106 case ISD::FSHR: {
12107 if (IsVec)
12108 return std::nullopt;
12109
12110 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12111 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12112 if (!ShiftOp || Op.getValueType().isVector())
12113 return std::nullopt;
12114
12115 uint64_t BitsProvided = Op.getValueSizeInBits();
12116 if (BitsProvided % 8 != 0)
12117 return std::nullopt;
12118
12119 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12120 if (BitShift % 8)
12121 return std::nullopt;
12122
12123 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12124 uint64_t ByteShift = BitShift / 8;
12125
12126 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12127 uint64_t BytesProvided = BitsProvided / 8;
12128 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12129 NewIndex %= BytesProvided;
12130 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12131 }
12132
12133 case ISD::SRA:
12134 case ISD::SRL: {
12135 if (IsVec)
12136 return std::nullopt;
12137
12138 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12139 if (!ShiftOp)
12140 return std::nullopt;
12141
12142 uint64_t BitShift = ShiftOp->getZExtValue();
12143 if (BitShift % 8)
12144 return std::nullopt;
12145
12146 auto BitsProvided = Op.getScalarValueSizeInBits();
12147 if (BitsProvided % 8 != 0)
12148 return std::nullopt;
12149
12150 uint64_t BytesProvided = BitsProvided / 8;
12151 uint64_t ByteShift = BitShift / 8;
12152 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12153 // If the byte we are trying to provide (as tracked by index) falls in this
12154 // range, then the SRL provides the byte. The byte of interest of the src of
12155 // the SRL is Index + ByteShift
12156 return BytesProvided - ByteShift > Index
12157 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12158 Index + ByteShift)
12159 : ByteProvider<SDValue>::getConstantZero();
12160 }
12161
12162 case ISD::SHL: {
12163 if (IsVec)
12164 return std::nullopt;
12165
12166 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12167 if (!ShiftOp)
12168 return std::nullopt;
12169
12170 uint64_t BitShift = ShiftOp->getZExtValue();
12171 if (BitShift % 8 != 0)
12172 return std::nullopt;
12173 uint64_t ByteShift = BitShift / 8;
12174
12175 // If we are shifting by an amount greater than (or equal to)
12176 // the index we are trying to provide, then it provides 0s. If not,
12177 // then these bytes are not definitively 0s, and the corresponding byte
12178 // of interest is Index - ByteShift of the src.
12179 return Index < ByteShift
12180 ? ByteProvider<SDValue>::getConstantZero()
12181 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12182 Depth + 1, StartingIndex);
12183 }
12184 case ISD::ANY_EXTEND:
12185 case ISD::SIGN_EXTEND:
12186 case ISD::ZERO_EXTEND:
12187 case ISD::SIGN_EXTEND_INREG:
12188 case ISD::AssertZext:
12189 case ISD::AssertSext: {
12190 if (IsVec)
12191 return std::nullopt;
12192
12193 SDValue NarrowOp = Op->getOperand(0);
12194 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12195 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12196 Op->getOpcode() == ISD::AssertZext ||
12197 Op->getOpcode() == ISD::AssertSext) {
12198 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12199 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12200 }
12201 if (NarrowBitWidth % 8 != 0)
12202 return std::nullopt;
12203 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12204
12205 if (Index >= NarrowByteWidth)
12206 return Op.getOpcode() == ISD::ZERO_EXTEND
12207 ? std::optional<ByteProvider<SDValue>>(
12208 ByteProvider<SDValue>::getConstantZero())
12209 : std::nullopt;
12210 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12211 }
12212
12213 case ISD::TRUNCATE: {
12214 if (IsVec)
12215 return std::nullopt;
12216
12217 uint64_t NarrowByteWidth = BitWidth / 8;
12218
12219 if (NarrowByteWidth >= Index) {
12220 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12221 StartingIndex);
12222 }
12223
12224 return std::nullopt;
12225 }
12226
12227 case ISD::CopyFromReg: {
12228 if (BitWidth / 8 > Index)
12229 return calculateSrcByte(Op, StartingIndex, Index);
12230
12231 return std::nullopt;
12232 }
12233
12234 case ISD::LOAD: {
12235 auto *L = cast<LoadSDNode>(Op.getNode());
12236
12237 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12238 if (NarrowBitWidth % 8 != 0)
12239 return std::nullopt;
12240 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12241
12242 // If the width of the load does not reach the byte we are trying to provide
12243 // and it is not a ZEXTLOAD, then the load does not provide the byte in
12244 // question
12245 if (Index >= NarrowByteWidth) {
12246 return L->getExtensionType() == ISD::ZEXTLOAD
12247 ? std::optional<ByteProvider<SDValue>>(
12248 ByteProvider<SDValue>::getConstantZero())
12249 : std::nullopt;
12250 }
12251
12252 if (NarrowByteWidth > Index) {
12253 return calculateSrcByte(Op, StartingIndex, Index);
12254 }
12255
12256 return std::nullopt;
12257 }
12258
12259 case ISD::BSWAP: {
12260 if (IsVec)
12261 return std::nullopt;
12262
12263 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12264 Depth + 1, StartingIndex);
12265 }
12266
12267 case ISD::EXTRACT_VECTOR_ELT: {
12268 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12269 if (!IdxOp)
12270 return std::nullopt;
12271 auto VecIdx = IdxOp->getZExtValue();
12272 auto ScalarSize = Op.getScalarValueSizeInBits();
12273 if (ScalarSize < 32)
12274 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12275 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12276 StartingIndex, Index);
12277 }
12278
12279 case AMDGPUISD::PERM: {
12280 if (IsVec)
12281 return std::nullopt;
12282
12283 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12284 if (!PermMask)
12285 return std::nullopt;
12286
12287 auto IdxMask =
12288 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12289 if (IdxMask > 0x07 && IdxMask != 0x0c)
12290 return std::nullopt;
12291
12292 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12293 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12294
12295 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12296 : ByteProvider<SDValue>(
12297 ByteProvider<SDValue>::getConstantZero());
12298 }
12299
12300 default: {
12301 return std::nullopt;
12302 }
12303 }
12304
12305 llvm_unreachable("fully handled switch");
12306}
12307
12308// Returns true if the Operand is a scalar extended or loaded from a 16-bit value
12309static bool isExtendedFrom16Bits(SDValue &Operand) {
12310
12311 switch (Operand.getOpcode()) {
12312 case ISD::ANY_EXTEND:
12313 case ISD::SIGN_EXTEND:
12314 case ISD::ZERO_EXTEND: {
12315 auto OpVT = Operand.getOperand(0).getValueType();
12316 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12317 }
12318 case ISD::LOAD: {
12319 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12320 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12321 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12322 ExtType == ISD::EXTLOAD) {
12323 auto MemVT = L->getMemoryVT();
12324 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12325 }
12326 return L->getMemoryVT().getSizeInBits() == 16;
12327 }
12328 default:
12329 return false;
12330 }
12331}
12332
12333// Returns true if the mask matches consecutive bytes, and the first byte
12334// begins at an even (16-bit aligned) byte offset from the 0th byte
12335static bool addresses16Bits(int Mask) {
12336 int Low8 = Mask & 0xff;
12337 int Hi8 = (Mask & 0xff00) >> 8;
12338
12339 assert(Low8 < 8 && Hi8 < 8);
12340 // Are the bytes contiguous in the order of increasing addresses.
12341 bool IsConsecutive = (Hi8 - Low8 == 1);
12342 // Is the first byte at location that is aligned for 16 bit instructions.
12343 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12344 // In this case, we still need code to extract the 16 bit operand, so it
12345 // is better to use i8 v_perm
12346 bool Is16Aligned = !(Low8 % 2);
12347
12348 return IsConsecutive && Is16Aligned;
12349}
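// For example, a 16-bit half-mask of 0x0504 (bytes 4 and 5) or 0x0302 (bytes 2
// and 3) is accepted, while 0x0201 is rejected because it starts at the odd
// byte 1 even though the bytes are consecutive.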
12350
12351// Do not lower into v_perm if the operands are actually 16 bit
12352// and the selected bits (based on PermMask) correspond with two
12353// easily addressable 16 bit operands.
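// For instance, if both operands are (extensions of) 16-bit values, a PermMask
// of 0x07060302 addresses two aligned 16-bit halves, so this returns false and
// the caller keeps the plain 16-bit operations instead of emitting v_perm.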
12354static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12355 SDValue &OtherOp) {
12356 int Low16 = PermMask & 0xffff;
12357 int Hi16 = (PermMask & 0xffff0000) >> 16;
12358
12359 auto TempOp = peekThroughBitcasts(Op);
12360 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12361
12362 auto OpIs16Bit =
12363 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12364 if (!OpIs16Bit)
12365 return true;
12366
12367 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12368 isExtendedFrom16Bits(TempOtherOp);
12369 if (!OtherOpIs16Bit)
12370 return true;
12371
12372 // Do we cleanly address both
12373 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12374}
12375
12376static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12377 unsigned DWordOffset) {
12378 SDValue Ret;
12379
12380 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12381 // ByteProvider must be at least 8 bits
12382 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12383
12384 if (TypeSize <= 32)
12385 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12386
12387 if (Src.getValueType().isVector()) {
12388 auto ScalarTySize = Src.getScalarValueSizeInBits();
12389 auto ScalarTy = Src.getValueType().getScalarType();
12390 if (ScalarTySize == 32) {
12391 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12392 DAG.getConstant(DWordOffset, SL, MVT::i32));
12393 }
12394 if (ScalarTySize > 32) {
12395 Ret = DAG.getNode(
12396 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12397 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12398 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12399 if (ShiftVal)
12400 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12401 DAG.getConstant(ShiftVal, SL, MVT::i32));
12402 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12403 }
12404
12405 assert(ScalarTySize < 32);
12406 auto NumElements = TypeSize / ScalarTySize;
12407 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12408 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12409 auto NumElementsIn32 = 32 / ScalarTySize;
12410 auto NumAvailElements = DWordOffset < Trunc32Elements
12411 ? NumElementsIn32
12412 : NumElements - NormalizedTrunc;
12413
12414 SmallVector<SDValue, 4> VecSrcs;
12415 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12416 NumAvailElements);
12417
12418 Ret = DAG.getBuildVector(
12419 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12420 VecSrcs);
12421 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12422 }
12423
12424 /// Scalar Type
12425 auto ShiftVal = 32 * DWordOffset;
12426 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12427 DAG.getConstant(ShiftVal, SL, MVT::i32));
12428 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12429}
12430
12431static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12432 SelectionDAG &DAG = DCI.DAG;
12433 [[maybe_unused]] EVT VT = N->getValueType(0);
12434 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12435
12436 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12437 assert(VT == MVT::i32);
12438 for (int i = 0; i < 4; i++) {
12439 // Find the ByteProvider that provides the ith byte of the result of OR
12440 std::optional<ByteProvider<SDValue>> P =
12441 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12442 // TODO support constantZero
12443 if (!P || P->isConstantZero())
12444 return SDValue();
12445
12446 PermNodes.push_back(*P);
12447 }
12448 if (PermNodes.size() != 4)
12449 return SDValue();
12450
12451 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12452 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12453 uint64_t PermMask = 0x00000000;
12454 for (size_t i = 0; i < PermNodes.size(); i++) {
12455 auto PermOp = PermNodes[i];
12456 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12457 // by sizeof(Src2) = 4
12458 int SrcByteAdjust = 4;
12459
12460 // If the Src uses a byte from a different DWORD, then it corresponds
12461 // with a different source
12462 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12463 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12464 if (SecondSrc)
12465 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12466 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12467 return SDValue();
12468
12469 // Set the index of the second distinct Src node
12470 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12471 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12472 SrcByteAdjust = 0;
12473 }
12474 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12476 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12477 }
12478 SDLoc DL(N);
12479 SDValue Op = *PermNodes[FirstSrc.first].Src;
12480 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12481 assert(Op.getValueSizeInBits() == 32);
12482
12483 // Check that we are not just extracting the bytes in order from an op
12484 if (!SecondSrc) {
12485 int Low16 = PermMask & 0xffff;
12486 int Hi16 = (PermMask & 0xffff0000) >> 16;
12487
12488 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12489 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12490
12491 // The perm op would really just produce Op. So combine into Op
12492 if (WellFormedLow && WellFormedHi)
12493 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12494 }
12495
12496 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12497
12498 if (SecondSrc) {
12499 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12500 assert(OtherOp.getValueSizeInBits() == 32);
12501 }
12502
12503 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12504
12505 assert(Op.getValueType().isByteSized() &&
12506 OtherOp.getValueType().isByteSized());
12507
12508 // If the ultimate src is less than 32 bits, then we will only be
12509 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12510 // CalculateByteProvider would not have returned Op as source if we
12511 // used a byte that is outside its ValueType. Thus, we are free to
12512 // ANY_EXTEND as the extended bits are don't-cares.
12513 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12514 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12515
12516 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12517 DAG.getConstant(PermMask, DL, MVT::i32));
12518 }
12519 return SDValue();
12520}
12521
12522SDValue SITargetLowering::performOrCombine(SDNode *N,
12523 DAGCombinerInfo &DCI) const {
12524 SelectionDAG &DAG = DCI.DAG;
12525 SDValue LHS = N->getOperand(0);
12526 SDValue RHS = N->getOperand(1);
12527
12528 EVT VT = N->getValueType(0);
12529 if (VT == MVT::i1) {
12530 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12531 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12532 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12533 SDValue Src = LHS.getOperand(0);
12534 if (Src != RHS.getOperand(0))
12535 return SDValue();
12536
12537 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12538 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12539 if (!CLHS || !CRHS)
12540 return SDValue();
12541
12542 // Only 10 bits are used.
12543 static const uint32_t MaxMask = 0x3ff;
12544
12545 uint32_t NewMask =
12546 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12547 SDLoc DL(N);
12548 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12549 DAG.getConstant(NewMask, DL, MVT::i32));
12550 }
12551
12552 return SDValue();
12553 }
12554
12555 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12556 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12557 LHS.getOpcode() == AMDGPUISD::PERM &&
12558 isa<ConstantSDNode>(LHS.getOperand(2))) {
12559 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12560 if (!Sel)
12561 return SDValue();
12562
12563 Sel |= LHS.getConstantOperandVal(2);
12564 SDLoc DL(N);
12565 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12566 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12567 }
12568
12569 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12570 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12571 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12572 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12573
12574 // If all the uses of an or need to extract the individual elements, do not
12575 // attempt to lower into v_perm
12576 auto usesCombinedOperand = [](SDNode *OrUse) {
12577 // If we have any non-vectorized use, then it is a candidate for v_perm
12578 if (OrUse->getOpcode() != ISD::BITCAST ||
12579 !OrUse->getValueType(0).isVector())
12580 return true;
12581
12582 // If we have any non-vectorized use, then it is a candidate for v_perm
12583 for (auto *VUser : OrUse->users()) {
12584 if (!VUser->getValueType(0).isVector())
12585 return true;
12586
12587 // If the use of a vector is a store, then combining via a v_perm
12588 // is beneficial.
12589 // TODO -- whitelist more uses
12590 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12591 if (VUser->getOpcode() == VectorwiseOp)
12592 return true;
12593 }
12594 return false;
12595 };
12596
12597 if (!any_of(N->users(), usesCombinedOperand))
12598 return SDValue();
12599
12600 uint32_t LHSMask = getPermuteMask(LHS);
12601 uint32_t RHSMask = getPermuteMask(RHS);
12602
12603 if (LHSMask != ~0u && RHSMask != ~0u) {
12604 // Canonicalize the expression in an attempt to have fewer unique masks
12605 // and therefore fewer registers used to hold the masks.
12606 if (LHSMask > RHSMask) {
12607 std::swap(LHSMask, RHSMask);
12608 std::swap(LHS, RHS);
12609 }
12610
12611 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12612 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12613 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12614 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12615
12616 // Check if we need to combine values from two sources within a byte.
12617 if (!(LHSUsedLanes & RHSUsedLanes) &&
12618 // If we select high and lower word keep it for SDWA.
12619 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12620 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12621 // Kill zero bytes selected by other mask. Zero value is 0xc.
12622 LHSMask &= ~RHSUsedLanes;
12623 RHSMask &= ~LHSUsedLanes;
12624 // Add 4 to each active LHS lane
12625 LHSMask |= LHSUsedLanes & 0x04040404;
12626 // Combine masks
12627 uint32_t Sel = LHSMask | RHSMask;
12628 SDLoc DL(N);
12629
12630 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12631 RHS.getOperand(0),
12632 DAG.getConstant(Sel, DL, MVT::i32));
12633 }
12634 }
12635 if (LHSMask == ~0u || RHSMask == ~0u) {
12636 if (SDValue Perm = matchPERM(N, DCI))
12637 return Perm;
12638 }
12639 }
12640
12641 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12642 return SDValue();
12643
12644 // TODO: This could be a generic combine with a predicate for extracting the
12645 // high half of an integer being free.
12646
12647 // (or i64:x, (zero_extend i32:y)) ->
12648 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12649 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12650 RHS.getOpcode() != ISD::ZERO_EXTEND)
12651 std::swap(LHS, RHS);
12652
12653 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12654 SDValue ExtSrc = RHS.getOperand(0);
12655 EVT SrcVT = ExtSrc.getValueType();
12656 if (SrcVT == MVT::i32) {
12657 SDLoc SL(N);
12658 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12659 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12660
12661 DCI.AddToWorklist(LowOr.getNode());
12662 DCI.AddToWorklist(HiBits.getNode());
12663
12664 SDValue Vec =
12665 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12666 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12667 }
12668 }
12669
12670 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12671 if (CRHS) {
12672 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12673 N->getOperand(0), CRHS))
12674 return Split;
12675 }
12676
12677 return SDValue();
12678}
12679
12680SDValue SITargetLowering::performXorCombine(SDNode *N,
12681 DAGCombinerInfo &DCI) const {
12682 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12683 return RV;
12684
12685 SDValue LHS = N->getOperand(0);
12686 SDValue RHS = N->getOperand(1);
12687
12688 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12689 SelectionDAG &DAG = DCI.DAG;
12690
12691 EVT VT = N->getValueType(0);
12692 if (CRHS && VT == MVT::i64) {
12693 if (SDValue Split =
12694 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12695 return Split;
12696 }
12697
12698 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12699 // fneg-like xors into 64-bit select.
12700 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12701 // This looks like an fneg, try to fold as a source modifier.
12702 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12703 shouldFoldFNegIntoSrc(N, LHS)) {
12704 // xor (select c, a, b), 0x80000000 ->
12705 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12706 SDLoc DL(N);
12707 SDValue CastLHS =
12708 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12709 SDValue CastRHS =
12710 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12711 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12712 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12713 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12714 LHS->getOperand(0), FNegLHS, FNegRHS);
12715 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12716 }
12717 }
12718
12719 return SDValue();
12720}
12721
12722SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12723 DAGCombinerInfo &DCI) const {
12724 if (!Subtarget->has16BitInsts() ||
12725 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12726 return SDValue();
12727
12728 EVT VT = N->getValueType(0);
12729 if (VT != MVT::i32)
12730 return SDValue();
12731
12732 SDValue Src = N->getOperand(0);
12733 if (Src.getValueType() != MVT::i16)
12734 return SDValue();
12735
12736 return SDValue();
12737}
12738
12739SDValue
12740SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12741 DAGCombinerInfo &DCI) const {
12742 SDValue Src = N->getOperand(0);
12743 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12744
12745 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12746 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12747 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12748 VTSign->getVT() == MVT::i8) ||
12749 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12750 VTSign->getVT() == MVT::i16))) {
12751 assert(Subtarget->hasScalarSubwordLoads() &&
12752 "s_buffer_load_{u8, i8} are supported "
12753 "in GFX12 (or newer) architectures.");
12754 EVT VT = Src.getValueType();
12755 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12756 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12757 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12758 SDLoc DL(N);
12759 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12760 SDValue Ops[] = {
12761 Src.getOperand(0), // source register
12762 Src.getOperand(1), // offset
12763 Src.getOperand(2) // cachePolicy
12764 };
12765 auto *M = cast<MemSDNode>(Src);
12766 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12767 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12768 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12769 return LoadVal;
12770 }
12771 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12772 VTSign->getVT() == MVT::i8) ||
12773 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12774 VTSign->getVT() == MVT::i16)) &&
12775 Src.hasOneUse()) {
12776 auto *M = cast<MemSDNode>(Src);
12777 SDValue Ops[] = {Src.getOperand(0), // Chain
12778 Src.getOperand(1), // rsrc
12779 Src.getOperand(2), // vindex
12780 Src.getOperand(3), // voffset
12781 Src.getOperand(4), // soffset
12782 Src.getOperand(5), // offset
12783 Src.getOperand(6), Src.getOperand(7)};
12784 // replace with BUFFER_LOAD_BYTE/SHORT
12785 SDVTList ResList =
12786 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12787 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12788 ? AMDGPUISD::BUFFER_LOAD_BYTE
12789 : AMDGPUISD::BUFFER_LOAD_SHORT;
12790 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12791 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12792 return DCI.DAG.getMergeValues(
12793 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12794 }
12795 return SDValue();
12796}
12797
12798SDValue SITargetLowering::performClassCombine(SDNode *N,
12799 DAGCombinerInfo &DCI) const {
12800 SelectionDAG &DAG = DCI.DAG;
12801 SDValue Mask = N->getOperand(1);
12802
12803 // fp_class x, 0 -> false
12804 if (isNullConstant(Mask))
12805 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12806
12807 if (N->getOperand(0).isUndef())
12808 return DAG.getUNDEF(MVT::i1);
12809
12810 return SDValue();
12811}
12812
12813SDValue SITargetLowering::performRcpCombine(SDNode *N,
12814 DAGCombinerInfo &DCI) const {
12815 EVT VT = N->getValueType(0);
12816 SDValue N0 = N->getOperand(0);
12817
12818 if (N0.isUndef()) {
12819 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12820 SDLoc(N), VT);
12821 }
12822
12823 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12824 N0.getOpcode() == ISD::SINT_TO_FP)) {
12825 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12826 N->getFlags());
12827 }
12828
12829 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12830 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12831 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12832 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12833 N->getFlags());
12834 }
12835
12836 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12837}
12838
12839bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12840 unsigned MaxDepth) const {
12841 unsigned Opcode = Op.getOpcode();
12842 if (Opcode == ISD::FCANONICALIZE)
12843 return true;
12844
12845 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12846 const auto &F = CFP->getValueAPF();
12847 if (F.isNaN() && F.isSignaling())
12848 return false;
12849 if (!F.isDenormal())
12850 return true;
12851
12852 DenormalMode Mode =
12853 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12854 return Mode == DenormalMode::getIEEE();
12855 }
12856
12857 // If source is a result of another standard FP operation it is already in
12858 // canonical form.
12859 if (MaxDepth == 0)
12860 return false;
12861
12862 switch (Opcode) {
12863 // These will flush denorms if required.
12864 case ISD::FADD:
12865 case ISD::FSUB:
12866 case ISD::FMUL:
12867 case ISD::FCEIL:
12868 case ISD::FFLOOR:
12869 case ISD::FMA:
12870 case ISD::FMAD:
12871 case ISD::FSQRT:
12872 case ISD::FDIV:
12873 case ISD::FREM:
12874 case ISD::FP_ROUND:
12875 case ISD::FP_EXTEND:
12876 case ISD::FP16_TO_FP:
12877 case ISD::FP_TO_FP16:
12878 case ISD::BF16_TO_FP:
12879 case ISD::FP_TO_BF16:
12880 case ISD::FLDEXP:
12883 case AMDGPUISD::RCP:
12884 case AMDGPUISD::RSQ:
12888 case AMDGPUISD::LOG:
12889 case AMDGPUISD::EXP:
12893 case AMDGPUISD::FRACT:
12900 case AMDGPUISD::SIN_HW:
12901 case AMDGPUISD::COS_HW:
12902 return true;
12903
12904 // It can/will be lowered or combined as a bit operation.
12905 // Need to check their input recursively to handle.
12906 case ISD::FNEG:
12907 case ISD::FABS:
12908 case ISD::FCOPYSIGN:
12909 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12910
12911 case ISD::AND:
12912 if (Op.getValueType() == MVT::i32) {
12913 // Be careful as we only know it is a bitcast floating point type. It
12914 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12915 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12916 // is valid to optimize for all types.
12917 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12918 if (RHS->getZExtValue() == 0xffff0000) {
12919 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12920 }
12921 }
12922 }
12923 break;
12924
12925 case ISD::FSIN:
12926 case ISD::FCOS:
12927 case ISD::FSINCOS:
12928 return Op.getValueType().getScalarType() != MVT::f16;
12929
12930 case ISD::FMINNUM:
12931 case ISD::FMAXNUM:
12932 case ISD::FMINNUM_IEEE:
12933 case ISD::FMAXNUM_IEEE:
12934 case ISD::FMINIMUM:
12935 case ISD::FMAXIMUM:
12936 case AMDGPUISD::CLAMP:
12937 case AMDGPUISD::FMED3:
12938 case AMDGPUISD::FMAX3:
12939 case AMDGPUISD::FMIN3:
12940 case AMDGPUISD::FMAXIMUM3:
12941 case AMDGPUISD::FMINIMUM3: {
12942 // FIXME: Shouldn't treat the generic operations differently based on these.
12943 // However, we aren't really required to flush the result from
12944 // minnum/maxnum.
12945
12946 // snans will be quieted, so we only need to worry about denormals.
12947 if (Subtarget->supportsMinMaxDenormModes() ||
12948 // FIXME: denormalsEnabledForType is broken for dynamic
12949 denormalsEnabledForType(DAG, Op.getValueType()))
12950 return true;
12951
12952 // Flushing may be required.
12953 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12954 // targets need to check their input recursively.
12955
12956 // FIXME: Does this apply with clamp? It's implemented with max.
12957 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12958 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12959 return false;
12960 }
12961
12962 return true;
12963 }
12964 case ISD::SELECT: {
12965 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12966 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12967 }
12968 case ISD::BUILD_VECTOR: {
12969 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12970 SDValue SrcOp = Op.getOperand(i);
12971 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12972 return false;
12973 }
12974
12975 return true;
12976 }
12977 case ISD::EXTRACT_VECTOR_ELT:
12978 case ISD::EXTRACT_SUBVECTOR: {
12979 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12980 }
12981 case ISD::INSERT_VECTOR_ELT: {
12982 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12983 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12984 }
12985 case ISD::UNDEF:
12986 // Could be anything.
12987 return false;
12988
12989 case ISD::BITCAST:
12990 // TODO: This is incorrect as it loses track of the operand's type. We may
12991 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12992 // same bits that are canonicalized in one type need not be in the other.
12993 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12994 case ISD::TRUNCATE: {
12995 // Hack around the mess we make when legalizing extract_vector_elt
12996 if (Op.getValueType() == MVT::i16) {
12997 SDValue TruncSrc = Op.getOperand(0);
12998 if (TruncSrc.getValueType() == MVT::i32 &&
12999 TruncSrc.getOpcode() == ISD::BITCAST &&
13000 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13001 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13002 }
13003 }
13004 return false;
13005 }
13006 case ISD::INTRINSIC_WO_CHAIN: {
13007 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13008 // TODO: Handle more intrinsics
13009 switch (IntrinsicID) {
13010 case Intrinsic::amdgcn_cvt_pkrtz:
13011 case Intrinsic::amdgcn_cubeid:
13012 case Intrinsic::amdgcn_frexp_mant:
13013 case Intrinsic::amdgcn_fdot2:
13014 case Intrinsic::amdgcn_rcp:
13015 case Intrinsic::amdgcn_rsq:
13016 case Intrinsic::amdgcn_rsq_clamp:
13017 case Intrinsic::amdgcn_rcp_legacy:
13018 case Intrinsic::amdgcn_rsq_legacy:
13019 case Intrinsic::amdgcn_trig_preop:
13020 case Intrinsic::amdgcn_log:
13021 case Intrinsic::amdgcn_exp2:
13022 case Intrinsic::amdgcn_sqrt:
13023 return true;
13024 default:
13025 break;
13026 }
13027
13028 break;
13029 }
13030 default:
13031 break;
13032 }
13033
13034 // FIXME: denormalsEnabledForType is broken for dynamic
13035 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13036 DAG.isKnownNeverSNaN(Op);
13037}
13038
13039bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13040 unsigned MaxDepth) const {
13041 const MachineRegisterInfo &MRI = MF.getRegInfo();
13042 MachineInstr *MI = MRI.getVRegDef(Reg);
13043 unsigned Opcode = MI->getOpcode();
13044
13045 if (Opcode == AMDGPU::G_FCANONICALIZE)
13046 return true;
13047
13048 std::optional<FPValueAndVReg> FCR;
13049 // Constant splat (can be padded with undef) or scalar constant.
13050 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13051 if (FCR->Value.isSignaling())
13052 return false;
13053 if (!FCR->Value.isDenormal())
13054 return true;
13055
13056 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13057 return Mode == DenormalMode::getIEEE();
13058 }
13059
13060 if (MaxDepth == 0)
13061 return false;
13062
13063 switch (Opcode) {
13064 case AMDGPU::G_FADD:
13065 case AMDGPU::G_FSUB:
13066 case AMDGPU::G_FMUL:
13067 case AMDGPU::G_FCEIL:
13068 case AMDGPU::G_FFLOOR:
13069 case AMDGPU::G_FRINT:
13070 case AMDGPU::G_FNEARBYINT:
13071 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13072 case AMDGPU::G_INTRINSIC_TRUNC:
13073 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13074 case AMDGPU::G_FMA:
13075 case AMDGPU::G_FMAD:
13076 case AMDGPU::G_FSQRT:
13077 case AMDGPU::G_FDIV:
13078 case AMDGPU::G_FREM:
13079 case AMDGPU::G_FPOW:
13080 case AMDGPU::G_FPEXT:
13081 case AMDGPU::G_FLOG:
13082 case AMDGPU::G_FLOG2:
13083 case AMDGPU::G_FLOG10:
13084 case AMDGPU::G_FPTRUNC:
13085 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13086 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13087 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13088 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13089 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13090 return true;
13091 case AMDGPU::G_FNEG:
13092 case AMDGPU::G_FABS:
13093 case AMDGPU::G_FCOPYSIGN:
13094 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13095 case AMDGPU::G_FMINNUM:
13096 case AMDGPU::G_FMAXNUM:
13097 case AMDGPU::G_FMINNUM_IEEE:
13098 case AMDGPU::G_FMAXNUM_IEEE:
13099 case AMDGPU::G_FMINIMUM:
13100 case AMDGPU::G_FMAXIMUM: {
13101 if (Subtarget->supportsMinMaxDenormModes() ||
13102 // FIXME: denormalsEnabledForType is broken for dynamic
13103 denormalsEnabledForType(MRI.getType(Reg), MF))
13104 return true;
13105
13106 [[fallthrough]];
13107 }
13108 case AMDGPU::G_BUILD_VECTOR:
13109 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13110 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13111 return false;
13112 return true;
13113 case AMDGPU::G_INTRINSIC:
13114 case AMDGPU::G_INTRINSIC_CONVERGENT:
13115 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13116 case Intrinsic::amdgcn_fmul_legacy:
13117 case Intrinsic::amdgcn_fmad_ftz:
13118 case Intrinsic::amdgcn_sqrt:
13119 case Intrinsic::amdgcn_fmed3:
13120 case Intrinsic::amdgcn_sin:
13121 case Intrinsic::amdgcn_cos:
13122 case Intrinsic::amdgcn_log:
13123 case Intrinsic::amdgcn_exp2:
13124 case Intrinsic::amdgcn_log_clamp:
13125 case Intrinsic::amdgcn_rcp:
13126 case Intrinsic::amdgcn_rcp_legacy:
13127 case Intrinsic::amdgcn_rsq:
13128 case Intrinsic::amdgcn_rsq_clamp:
13129 case Intrinsic::amdgcn_rsq_legacy:
13130 case Intrinsic::amdgcn_div_scale:
13131 case Intrinsic::amdgcn_div_fmas:
13132 case Intrinsic::amdgcn_div_fixup:
13133 case Intrinsic::amdgcn_fract:
13134 case Intrinsic::amdgcn_cvt_pkrtz:
13135 case Intrinsic::amdgcn_cubeid:
13136 case Intrinsic::amdgcn_cubema:
13137 case Intrinsic::amdgcn_cubesc:
13138 case Intrinsic::amdgcn_cubetc:
13139 case Intrinsic::amdgcn_frexp_mant:
13140 case Intrinsic::amdgcn_fdot2:
13141 case Intrinsic::amdgcn_trig_preop:
13142 return true;
13143 default:
13144 break;
13145 }
13146
13147 [[fallthrough]];
13148 default:
13149 return false;
13150 }
13151
13152 llvm_unreachable("invalid operation");
13153}
13154
13155// Constant fold canonicalize.
13156SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13157 const SDLoc &SL, EVT VT,
13158 const APFloat &C) const {
13159 // Flush denormals to 0 if not enabled.
13160 if (C.isDenormal()) {
13161 DenormalMode Mode =
13162 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13163 if (Mode == DenormalMode::getPreserveSign()) {
13164 return DAG.getConstantFP(
13165 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13166 }
13167
13168 if (Mode != DenormalMode::getIEEE())
13169 return SDValue();
13170 }
13171
13172 if (C.isNaN()) {
13173 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13174 if (C.isSignaling()) {
13175 // Quiet a signaling NaN.
13176 // FIXME: Is this supposed to preserve payload bits?
13177 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13178 }
13179
13180 // Make sure it is the canonical NaN bitpattern.
13181 //
13182 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13183 // immediate?
13184 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13185 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13186 }
13187
13188 // Already canonical.
13189 return DAG.getConstantFP(C, SL, VT);
13190}
13191
13192static bool vectorEltWillFoldAway(SDValue Op) {
13193 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13194}
13195
13196SDValue
13197SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13198 DAGCombinerInfo &DCI) const {
13199 SelectionDAG &DAG = DCI.DAG;
13200 SDValue N0 = N->getOperand(0);
13201 EVT VT = N->getValueType(0);
13202
13203 // fcanonicalize undef -> qnan
13204 if (N0.isUndef()) {
13205 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13206 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13207 }
13208
13209 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13210 EVT VT = N->getValueType(0);
13211 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13212 }
13213
13214 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13215 // (fcanonicalize k)
13216 //
13217 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13218
13219 // TODO: This could be better with wider vectors that will be split to v2f16,
13220 // and to consider uses since there aren't that many packed operations.
13221 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13222 isTypeLegal(MVT::v2f16)) {
13223 SDLoc SL(N);
13224 SDValue NewElts[2];
13225 SDValue Lo = N0.getOperand(0);
13226 SDValue Hi = N0.getOperand(1);
13227 EVT EltVT = Lo.getValueType();
13228
13230 for (unsigned I = 0; I != 2; ++I) {
13231 SDValue Op = N0.getOperand(I);
13232 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13233 NewElts[I] =
13234 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13235 } else if (Op.isUndef()) {
13236 // Handled below based on what the other operand is.
13237 NewElts[I] = Op;
13238 } else {
13239 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13240 }
13241 }
13242
13243 // If one half is undef, and one is constant, prefer a splat vector rather
13244 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13245 // cheaper to use and may be free with a packed operation.
13246 if (NewElts[0].isUndef()) {
13247 if (isa<ConstantFPSDNode>(NewElts[1]))
13248 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13249 ? NewElts[1]
13250 : DAG.getConstantFP(0.0f, SL, EltVT);
13251 }
13252
13253 if (NewElts[1].isUndef()) {
13254 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13255 ? NewElts[0]
13256 : DAG.getConstantFP(0.0f, SL, EltVT);
13257 }
13258
13259 return DAG.getBuildVector(VT, SL, NewElts);
13260 }
13261 }
13262
13263 return SDValue();
13264}
13265
13266static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13267 switch (Opc) {
13268 case ISD::FMAXNUM:
13269 case ISD::FMAXNUM_IEEE:
13270 return AMDGPUISD::FMAX3;
13271 case ISD::FMAXIMUM:
13272 return AMDGPUISD::FMAXIMUM3;
13273 case ISD::SMAX:
13274 return AMDGPUISD::SMAX3;
13275 case ISD::UMAX:
13276 return AMDGPUISD::UMAX3;
13277 case ISD::FMINNUM:
13278 case ISD::FMINNUM_IEEE:
13279 return AMDGPUISD::FMIN3;
13280 case ISD::FMINIMUM:
13281 return AMDGPUISD::FMINIMUM3;
13282 case ISD::SMIN:
13283 return AMDGPUISD::SMIN3;
13284 case ISD::UMIN:
13285 return AMDGPUISD::UMIN3;
13286 default:
13287 llvm_unreachable("Not a min/max opcode");
13288 }
13289}
13290
13291SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13292 const SDLoc &SL, SDValue Src,
13293 SDValue MinVal,
13294 SDValue MaxVal,
13295 bool Signed) const {
13296
13297 // med3 comes from
13298 // min(max(x, K0), K1), K0 < K1
13299 // max(min(x, K0), K1), K1 < K0
13300 //
13301 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13302 // min/max op.
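 // For example (constants chosen for illustration), the unsigned clamp
 //   umin(umax(x, 4), 12)
 // becomes med3(x, 4, 12): the median of {x, 4, 12} is exactly x clamped to
 // the range [4, 12].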
13303 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13304 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13305
13306 if (!MinK || !MaxK)
13307 return SDValue();
13308
13309 if (Signed) {
13310 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13311 return SDValue();
13312 } else {
13313 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13314 return SDValue();
13315 }
13316
13317 EVT VT = MinK->getValueType(0);
13318 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13319 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13320 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13321
13322 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13323 // not available, but this is unlikely to be profitable as constants
13324 // will often need to be materialized & extended, especially on
13325 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13326 return SDValue();
13327}
13328
13329static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13330 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13331 return C;
13332
13333 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13334 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13335 return C;
13336 }
13337
13338 return nullptr;
13339}
13340
13341SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13342 const SDLoc &SL, SDValue Op0,
13343 SDValue Op1) const {
13344 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13345 if (!K1)
13346 return SDValue();
13347
13348 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13349 if (!K0)
13350 return SDValue();
13351
13352 // Ordered >= (although NaN inputs should have folded away by now).
13353 if (K0->getValueAPF() > K1->getValueAPF())
13354 return SDValue();
13355
13356 const MachineFunction &MF = DAG.getMachineFunction();
13357 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13358
13359 // TODO: Check IEEE bit enabled?
13360 EVT VT = Op0.getValueType();
13361 if (Info->getMode().DX10Clamp) {
13362 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13363 // hardware fmed3 behavior converting to a min.
13364 // FIXME: Should this be allowing -0.0?
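    // For example, fminnum(fmaxnum(x, 0.0), 1.0) reaches this point with
    // K0 = 0.0 and K1 = 1.0 and is folded to AMDGPUISD::CLAMP x.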
13365 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13366 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13367 }
13368
13369 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13370 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13371 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13372 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13373 // then give the other result, which is different from med3 with a NaN
13374 // input.
13375 SDValue Var = Op0.getOperand(0);
13376 if (!DAG.isKnownNeverSNaN(Var))
13377 return SDValue();
13378
13379 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13380
13381 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13382 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13383 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13384 SDValue(K0, 0), SDValue(K1, 0));
13385 }
13386 }
13387
13388 return SDValue();
13389}
13390
13391/// \return true if the subtarget supports minimum3 and maximum3 with the given
13392/// base min/max opcode \p Opc for type \p VT.
13393static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13394 EVT VT) {
13395 switch (Opc) {
13396 case ISD::FMINNUM:
13397 case ISD::FMAXNUM:
13398 case ISD::FMINNUM_IEEE:
13399 case ISD::FMAXNUM_IEEE:
13400 case AMDGPUISD::FMIN_LEGACY:
13401 case AMDGPUISD::FMAX_LEGACY:
13402 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13403 case ISD::FMINIMUM:
13404 case ISD::FMAXIMUM:
13405 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13406 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13407 case ISD::SMAX:
13408 case ISD::SMIN:
13409 case ISD::UMAX:
13410 case ISD::UMIN:
13411 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13412 default:
13413 return false;
13414 }
13415
13416 llvm_unreachable("not a min/max opcode");
13417}
13418
13419SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13420 DAGCombinerInfo &DCI) const {
13421 SelectionDAG &DAG = DCI.DAG;
13422
13423 EVT VT = N->getValueType(0);
13424 unsigned Opc = N->getOpcode();
13425 SDValue Op0 = N->getOperand(0);
13426 SDValue Op1 = N->getOperand(1);
13427
13428 // Only do this if the inner op has one use since this will just increase
13429 // register pressure for no benefit.
13430
13431 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13432 // max(max(a, b), c) -> max3(a, b, c)
13433 // min(min(a, b), c) -> min3(a, b, c)
13434 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13435 SDLoc DL(N);
13436 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13437 Op0.getOperand(0), Op0.getOperand(1), Op1);
13438 }
13439
13440 // Try commuted.
13441 // max(a, max(b, c)) -> max3(a, b, c)
13442 // min(a, min(b, c)) -> min3(a, b, c)
13443 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13444 SDLoc DL(N);
13445 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13446 Op0, Op1.getOperand(0), Op1.getOperand(1));
13447 }
13448 }
13449
13450 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13451 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13452 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13453 if (SDValue Med3 = performIntMed3ImmCombine(
13454 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13455 return Med3;
13456 }
13457 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13458 if (SDValue Med3 = performIntMed3ImmCombine(
13459 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13460 return Med3;
13461 }
13462
13463 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13464 if (SDValue Med3 = performIntMed3ImmCombine(
13465 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13466 return Med3;
13467 }
13468 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13469 if (SDValue Med3 = performIntMed3ImmCombine(
13470 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13471 return Med3;
13472 }
13473
13474 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13475 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13476 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13477 (Opc == AMDGPUISD::FMIN_LEGACY &&
13478 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13479 (VT == MVT::f32 || VT == MVT::f64 ||
13480 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13481 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13482 Op0.hasOneUse()) {
13483 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13484 return Res;
13485 }
13486
13487 return SDValue();
13488}
13489
13490static bool isClampZeroToOne(SDValue A, SDValue B) {
13491 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13492 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13493 // FIXME: Should this be allowing -0.0?
13494 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13495 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13496 }
13497 }
13498
13499 return false;
13500}
13501
13502// FIXME: Should only worry about snans for version with chain.
13503SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13504 DAGCombinerInfo &DCI) const {
13505 EVT VT = N->getValueType(0);
13506 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13507 // NaNs. With a NaN input, the order of the operands may change the result.
13508
13509 SelectionDAG &DAG = DCI.DAG;
13510 SDLoc SL(N);
13511
13512 SDValue Src0 = N->getOperand(0);
13513 SDValue Src1 = N->getOperand(1);
13514 SDValue Src2 = N->getOperand(2);
13515
13516 if (isClampZeroToOne(Src0, Src1)) {
13517 // const_a, const_b, x -> clamp is safe in all cases including signaling
13518 // nans.
13519 // FIXME: Should this be allowing -0.0?
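    // For example, both fmed3(0.0, 1.0, x) and fmed3(1.0, 0.0, x) become
    // AMDGPUISD::CLAMP x.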
13520 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13521 }
13522
13523 const MachineFunction &MF = DAG.getMachineFunction();
13524 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13525
13526 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13527 // handling no dx10-clamp?
13528 if (Info->getMode().DX10Clamp) {
13529 // If NaNs are clamped to 0, we are free to reorder the inputs.
13530
13531 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13532 std::swap(Src0, Src1);
13533
13534 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13535 std::swap(Src1, Src2);
13536
13537 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13538 std::swap(Src0, Src1);
13539
13540 if (isClampZeroToOne(Src1, Src2))
13541 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13542 }
13543
13544 return SDValue();
13545}
13546
13547SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13548 DAGCombinerInfo &DCI) const {
13549 SDValue Src0 = N->getOperand(0);
13550 SDValue Src1 = N->getOperand(1);
13551 if (Src0.isUndef() && Src1.isUndef())
13552 return DCI.DAG.getUNDEF(N->getValueType(0));
13553 return SDValue();
13554}
13555
13556// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13557// expanded into a set of cmp/select instructions.
13558bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13559 unsigned NumElem,
13560 bool IsDivergentIdx,
13561 const GCNSubtarget *Subtarget) {
13562 if (UseDivergentRegisterIndexing)
13563 return false;
13564
13565 unsigned VecSize = EltSize * NumElem;
13566
13567 // Sub-dword vectors totaling two dwords or less have a better implementation.
13568 if (VecSize <= 64 && EltSize < 32)
13569 return false;
13570
13571 // Always expand the remaining sub-dword cases, otherwise they will be
13572 // lowered via memory.
13573 if (EltSize < 32)
13574 return true;
13575
13576 // Always do this if var-idx is divergent, otherwise it will become a loop.
13577 if (IsDivergentIdx)
13578 return true;
13579
13580 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13581 unsigned NumInsts = NumElem /* Number of compares */ +
13582 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
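  // For example, extracting from a v8i32 costs 8 compares + 8 v_cndmask_b32 =
  // 16 instructions, so it is expanded when VGPR-index mode is used (limit 16)
  // but not when movrel is used instead (limit 15).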
13583
13584 // On some architectures (GFX9) movrel is not available and it's better
13585 // to expand.
13586 if (Subtarget->useVGPRIndexMode())
13587 return NumInsts <= 16;
13588
13589 // If movrel is available, use it instead of expanding for vector of 8
13590 // elements.
13591 if (Subtarget->hasMovrel())
13592 return NumInsts <= 15;
13593
13594 return true;
13595}
13596
13597bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13598 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13599 if (isa<ConstantSDNode>(Idx))
13600 return false;
13601
13602 SDValue Vec = N->getOperand(0);
13603 EVT VecVT = Vec.getValueType();
13604 EVT EltVT = VecVT.getVectorElementType();
13605 unsigned EltSize = EltVT.getSizeInBits();
13606 unsigned NumElem = VecVT.getVectorNumElements();
13607
13608 return SITargetLowering::shouldExpandVectorDynExt(
13609 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13610}
13611
13612SDValue
13613SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13614 DAGCombinerInfo &DCI) const {
13615 SDValue Vec = N->getOperand(0);
13616 SelectionDAG &DAG = DCI.DAG;
13617
13618 EVT VecVT = Vec.getValueType();
13619 EVT VecEltVT = VecVT.getVectorElementType();
13620 EVT ResVT = N->getValueType(0);
13621
13622 unsigned VecSize = VecVT.getSizeInBits();
13623 unsigned VecEltSize = VecEltVT.getSizeInBits();
13624
13625 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13626 allUsesHaveSourceMods(N)) {
13627 SDLoc SL(N);
13628 SDValue Idx = N->getOperand(1);
13629 SDValue Elt =
13630 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13631 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13632 }
13633
13634 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13635 // =>
13636 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13637 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13638 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13639 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13640 SDLoc SL(N);
13641 SDValue Idx = N->getOperand(1);
13642 unsigned Opc = Vec.getOpcode();
13643
13644 switch (Opc) {
13645 default:
13646 break;
13647 // TODO: Support other binary operations.
13648 case ISD::FADD:
13649 case ISD::FSUB:
13650 case ISD::FMUL:
13651 case ISD::ADD:
13652 case ISD::UMIN:
13653 case ISD::UMAX:
13654 case ISD::SMIN:
13655 case ISD::SMAX:
13656 case ISD::FMAXNUM:
13657 case ISD::FMINNUM:
13658 case ISD::FMAXNUM_IEEE:
13659 case ISD::FMINNUM_IEEE:
13660 case ISD::FMAXIMUM:
13661 case ISD::FMINIMUM: {
13662 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13663 Vec.getOperand(0), Idx);
13664 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13665 Vec.getOperand(1), Idx);
13666
13667 DCI.AddToWorklist(Elt0.getNode());
13668 DCI.AddToWorklist(Elt1.getNode());
13669 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13670 }
13671 }
13672 }
13673
13674 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13675 if (shouldExpandVectorDynExt(N)) {
13676 SDLoc SL(N);
13677 SDValue Idx = N->getOperand(1);
13678 SDValue V;
13679 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13680 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13681 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13682 if (I == 0)
13683 V = Elt;
13684 else
13685 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13686 }
13687 return V;
13688 }
13689
13690 if (!DCI.isBeforeLegalize())
13691 return SDValue();
13692
13693 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13694 // elements. This exposes more load reduction opportunities by replacing
13695 // multiple small extract_vector_elements with a single 32-bit extract.
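  // For example, (i16 (extract_vector_elt v4i16:x, 3)) becomes an i32 extract
  // of element 1 of the value bitcast to v2i32, followed by a right shift by
  // 16 and a truncate.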
13696 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13697 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13698 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13699 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13700
13701 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13702 unsigned EltIdx = BitIndex / 32;
13703 unsigned LeftoverBitIdx = BitIndex % 32;
13704 SDLoc SL(N);
13705
13706 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13707 DCI.AddToWorklist(Cast.getNode());
13708
13709 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13710 DAG.getConstant(EltIdx, SL, MVT::i32));
13711 DCI.AddToWorklist(Elt.getNode());
13712 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13713 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13714 DCI.AddToWorklist(Srl.getNode());
13715
13716 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13717 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13718 DCI.AddToWorklist(Trunc.getNode());
13719
13720 if (VecEltVT == ResVT) {
13721 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13722 }
13723
13724 assert(ResVT.isScalarInteger());
13725 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13726 }
13727
13728 return SDValue();
13729}
13730
13731SDValue
13732SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13733 DAGCombinerInfo &DCI) const {
13734 SDValue Vec = N->getOperand(0);
13735 SDValue Idx = N->getOperand(2);
13736 EVT VecVT = Vec.getValueType();
13737 EVT EltVT = VecVT.getVectorElementType();
13738
13739 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13740 // => BUILD_VECTOR n x select (e, const-idx)
13741 if (!shouldExpandVectorDynExt(N))
13742 return SDValue();
13743
13744 SelectionDAG &DAG = DCI.DAG;
13745 SDLoc SL(N);
13746 SDValue Ins = N->getOperand(1);
13747 EVT IdxVT = Idx.getValueType();
13748
13749 SmallVector<SDValue, 16> Ops;
13750 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13751 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13752 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13753 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13754 Ops.push_back(V);
13755 }
13756
13757 return DAG.getBuildVector(VecVT, SL, Ops);
13758}
13759
13760/// Return the source of an fp_extend from f16 to f32, or a converted FP
13761/// constant.
13762static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13763 if (Src.getOpcode() == ISD::FP_EXTEND &&
13764 Src.getOperand(0).getValueType() == MVT::f16) {
13765 return Src.getOperand(0);
13766 }
13767
13768 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13769 APFloat Val = CFP->getValueAPF();
13770 bool LosesInfo = true;
13771 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13772 if (!LosesInfo)
13773 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13774 }
13775
13776 return SDValue();
13777}
13778
13779SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13780 DAGCombinerInfo &DCI) const {
13781 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13782 "combine only useful on gfx8");
13783
13784 SDValue TruncSrc = N->getOperand(0);
13785 EVT VT = N->getValueType(0);
13786 if (VT != MVT::f16)
13787 return SDValue();
13788
13789 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13790 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13791 return SDValue();
13792
13793 SelectionDAG &DAG = DCI.DAG;
13794 SDLoc SL(N);
13795
13796 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13797 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13798 // casting back.
13799
13800 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13801 // fmin(fmax(a, b), fmax(fmin(a, b), c))
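  // This is the usual median-of-three identity,
  //   median(a, b, c) == min(max(a, b), max(min(a, b), c)),
  // so the f32 fmed3 is replaced by four f16 min/max operations.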
13802 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13803 if (!A)
13804 return SDValue();
13805
13806 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13807 if (!B)
13808 return SDValue();
13809
13810 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13811 if (!C)
13812 return SDValue();
13813
13814 // This changes signaling nan behavior. If an input is a signaling nan, it
13815 // would have been quieted by the fpext originally. We don't care because
13816 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13817 // we would be worse off than just doing the promotion.
13818 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13819 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13820 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13821 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13822}
13823
13824unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13825 const SDNode *N0,
13826 const SDNode *N1) const {
13827 EVT VT = N0->getValueType(0);
13828
13829 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13830 // support denormals ever.
13831 if (((VT == MVT::f32 &&
13832 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13833 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13834 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13835 isOperationLegal(ISD::FMAD, VT))
13836 return ISD::FMAD;
13837
13838 const TargetOptions &Options = DAG.getTarget().Options;
13839 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13840 (N0->getFlags().hasAllowContract() &&
13841 N1->getFlags().hasAllowContract())) &&
13842 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13843 return ISD::FMA;
13844 }
13845
13846 return 0;
13847}
13848
13849// For a reassociatable opcode perform:
13850// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13851SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13852 SelectionDAG &DAG) const {
13853 EVT VT = N->getValueType(0);
13854 if (VT != MVT::i32 && VT != MVT::i64)
13855 return SDValue();
13856
13857 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13858 return SDValue();
13859
13860 unsigned Opc = N->getOpcode();
13861 SDValue Op0 = N->getOperand(0);
13862 SDValue Op1 = N->getOperand(1);
13863
13864 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13865 return SDValue();
13866
13867 if (Op0->isDivergent())
13868 std::swap(Op0, Op1);
13869
13870 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13871 return SDValue();
13872
13873 SDValue Op2 = Op1.getOperand(1);
13874 Op1 = Op1.getOperand(0);
13875 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13876 return SDValue();
13877
13878 if (Op1->isDivergent())
13879 std::swap(Op1, Op2);
13880
13881 SDLoc SL(N);
13882 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13883 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13884}
13885
13886static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13887 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13888 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13889 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13890 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13891 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13892}
13893
13894// Fold
13895// y = lshr i64 x, 32
13896// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
13897// with Const.hi == -1
13898// To
13899 // res = mad_u64_u32 y.lo, Const.lo, x.lo
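// This is valid because Const == Const.lo - (1 << 32) (mod 2^64) when
// Const.hi == -1, so y * Const == y.lo * Const.lo - (y.lo << 32); the
// subtracted term cancels the high half of x (which is y.lo), leaving
// y.lo * Const.lo + x.lo.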
13900static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
13901 SDValue MulLHS, SDValue MulRHS,
13902 SDValue AddRHS) {
13903 if (MulRHS.getOpcode() == ISD::SRL)
13904 std::swap(MulLHS, MulRHS);
13905
13906 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13907 return SDValue();
13908
13909 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13910 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13911 MulLHS.getOperand(0) != AddRHS)
13912 return SDValue();
13913
13914 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13915 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13916 return SDValue();
13917
13918 SDValue ConstMul =
13919 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
13920 return getMad64_32(DAG, SL, MVT::i64,
13921 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
13922 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
13923}
13924
13925// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13926// multiplies, if any.
13927//
13928// Full 64-bit multiplies that feed into an addition are lowered here instead
13929// of using the generic expansion. The generic expansion ends up with
13930// a tree of ADD nodes that prevents us from using the "add" part of the
13931// MAD instruction. The expansion produced here results in a chain of ADDs
13932// instead of a tree.
13933SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13934 DAGCombinerInfo &DCI) const {
13935 assert(N->getOpcode() == ISD::ADD);
13936
13937 SelectionDAG &DAG = DCI.DAG;
13938 EVT VT = N->getValueType(0);
13939 SDLoc SL(N);
13940 SDValue LHS = N->getOperand(0);
13941 SDValue RHS = N->getOperand(1);
13942
13943 if (VT.isVector())
13944 return SDValue();
13945
13946 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13947 // result in scalar registers for uniform values.
13948 if (!N->isDivergent() && Subtarget->hasSMulHi())
13949 return SDValue();
13950
13951 unsigned NumBits = VT.getScalarSizeInBits();
13952 if (NumBits <= 32 || NumBits > 64)
13953 return SDValue();
13954
13955 if (LHS.getOpcode() != ISD::MUL) {
13956 assert(RHS.getOpcode() == ISD::MUL);
13957 std::swap(LHS, RHS);
13958 }
13959
13960 // Avoid the fold if it would unduly increase the number of multiplies due to
13961 // multiple uses, except on hardware with full-rate multiply-add (which is
13962 // part of full-rate 64-bit ops).
13963 if (!Subtarget->hasFullRate64Ops()) {
13964 unsigned NumUsers = 0;
13965 for (SDNode *User : LHS->users()) {
13966 // There is a use that does not feed into addition, so the multiply can't
13967 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13968 if (User->getOpcode() != ISD::ADD)
13969 return SDValue();
13970
13971 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13972 // MUL + 3xADD + 3xADDC over 3xMAD.
13973 ++NumUsers;
13974 if (NumUsers >= 3)
13975 return SDValue();
13976 }
13977 }
13978
13979 SDValue MulLHS = LHS.getOperand(0);
13980 SDValue MulRHS = LHS.getOperand(1);
13981 SDValue AddRHS = RHS;
13982
13983 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
13984 return FoldedMAD;
13985
13986 // Always check whether operands are small unsigned values, since that
13987 // knowledge is useful in more cases. Check for small signed values only if
13988 // doing so can unlock a shorter code sequence.
13989 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13990 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13991
13992 bool MulSignedLo = false;
13993 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13994 MulSignedLo =
13995 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13996 }
13997
13998 // The operands and final result all have the same number of bits. If
13999 // operands need to be extended, they can be extended with garbage. The
14000 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14001 // truncated away in the end.
14002 if (VT != MVT::i64) {
14003 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14004 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14005 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14006 }
14007
14008 // The basic code generated is conceptually straightforward. Pseudo code:
14009 //
14010 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14011 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14012 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14013 //
14014 // The second and third lines are optional, depending on whether the factors
14015 // are {sign,zero}-extended or not.
14016 //
14017 // The actual DAG is noisier than the pseudo code, but only due to
14018 // instructions that disassemble values into low and high parts, and
14019 // assemble the final result.
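  // For example, when both factors are known to fit in 32 unsigned bits
  // (MulLHSUnsigned32 && MulRHSUnsigned32), only the single mad_u64_u32 is
  // emitted and the high-half fix-ups below are skipped.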
14020 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14021
14022 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14023 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14024 SDValue Accum =
14025 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14026
14027 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14028 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14029
14030 if (!MulLHSUnsigned32) {
14031 auto MulLHSHi =
14032 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14033 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14034 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14035 }
14036
14037 if (!MulRHSUnsigned32) {
14038 auto MulRHSHi =
14039 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14040 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14041 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14042 }
14043
14044 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14045 Accum = DAG.getBitcast(MVT::i64, Accum);
14046 }
14047
14048 if (VT != MVT::i64)
14049 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14050 return Accum;
14051}
14052
14053SDValue
14054SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14055 DAGCombinerInfo &DCI) const {
14056 SDValue RHS = N->getOperand(1);
14057 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14058 if (!CRHS)
14059 return SDValue();
14060
14061 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14062 // common.
14063 uint64_t Val = CRHS->getZExtValue();
14064 if (countr_zero(Val) >= 32) {
14065 SelectionDAG &DAG = DCI.DAG;
14066 SDLoc SL(N);
14067 SDValue LHS = N->getOperand(0);
14068
14069 // Avoid carry machinery if we know the low half of the add does not
14070 // contribute to the final result.
14071 //
14072 // add i64:x, K if computeTrailingZeros(K) >= 32
14073 // => build_pair (add x.hi, K.hi), x.lo
14074
14075 // Breaking the 64-bit add here with this strange constant is unlikely
14076 // to interfere with addressing mode patterns.
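    // For example, add i64:x, 0x123400000000 becomes
    //   build_pair (add x.hi, 0x1234), x.lo
    // since the low 32 bits of the constant are all zero.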
14077
14078 SDValue Hi = getHiHalf64(LHS, DAG);
14079 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14080 SDValue AddHi =
14081 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14082
14083 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14084 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14085 }
14086
14087 return SDValue();
14088}
14089
14090 // Collect the ultimate src of each of the mul node's operands, and confirm
14091 // each operand contributes no more than 8 bits.
14092static std::optional<ByteProvider<SDValue>>
14093handleMulOperand(const SDValue &MulOperand) {
14094 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14095 if (!Byte0 || Byte0->isConstantZero()) {
14096 return std::nullopt;
14097 }
14098 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14099 if (Byte1 && !Byte1->isConstantZero()) {
14100 return std::nullopt;
14101 }
14102 return Byte0;
14103}
14104
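// In the perm masks built below, a byte selector of 0x0c picks a constant
// zero byte (see ZeroMask = 0x0c0c0c0c). addPermMasks merges two masks so that
// a zero selector survives only where both inputs had one; otherwise the
// non-zero selector is kept.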
14105static unsigned addPermMasks(unsigned First, unsigned Second) {
14106 unsigned FirstCs = First & 0x0c0c0c0c;
14107 unsigned SecondCs = Second & 0x0c0c0c0c;
14108 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14109 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14110
14111 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14112 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14113 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14114 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14115
14116 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14117}
14118
14119struct DotSrc {
14120 SDValue SrcOp;
14121 int64_t PermMask;
14122 int64_t DWordOffset;
14123};
14124
14125static void placeSources(ByteProvider<SDValue> &Src0,
14126 ByteProvider<SDValue> &Src1,
14127 SmallVectorImpl<DotSrc> &Src0s,
14128 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14129
14130 assert(Src0.Src.has_value() && Src1.Src.has_value());
14131 // Src0s and Src1s are empty, just place arbitrarily.
14132 if (Step == 0) {
14133 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14134 Src0.SrcOffset / 4});
14135 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14136 Src1.SrcOffset / 4});
14137 return;
14138 }
14139
14140 for (int BPI = 0; BPI < 2; BPI++) {
14141 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14142 if (BPI == 1) {
14143 BPP = {Src1, Src0};
14144 }
14145 unsigned ZeroMask = 0x0c0c0c0c;
14146 unsigned FMask = 0xFF << (8 * (3 - Step));
14147
14148 unsigned FirstMask =
14149 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14150 unsigned SecondMask =
14151 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14152 // Attempt to find Src vector which contains our SDValue, if so, add our
14153 // perm mask to the existing one. If we are unable to find a match for the
14154 // first SDValue, attempt to find match for the second.
14155 int FirstGroup = -1;
14156 for (int I = 0; I < 2; I++) {
14157 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14158 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14159 return IterElt.SrcOp == *BPP.first.Src &&
14160 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14161 };
14162
14163 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14164 if (Match != Srcs.end()) {
14165 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14166 FirstGroup = I;
14167 break;
14168 }
14169 }
14170 if (FirstGroup != -1) {
14171 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14172 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14173 return IterElt.SrcOp == *BPP.second.Src &&
14174 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14175 };
14176 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14177 if (Match != Srcs.end()) {
14178 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14179 } else
14180 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14181 return;
14182 }
14183 }
14184
14185 // If we have made it here, then we could not find a match in Src0s or Src1s
14186 // for either Src0 or Src1, so just place them arbitrarily.
14187
14188 unsigned ZeroMask = 0x0c0c0c0c;
14189 unsigned FMask = 0xFF << (8 * (3 - Step));
14190
14191 Src0s.push_back(
14192 {*Src0.Src,
14193 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14194 Src0.SrcOffset / 4});
14195 Src1s.push_back(
14196 {*Src1.Src,
14197 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14198 Src1.SrcOffset / 4});
14199}
14200
14201static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14202 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14203 bool IsAny) {
14204
14205 // If we just have one source, just permute it accordingly.
14206 if (Srcs.size() == 1) {
14207 auto *Elt = Srcs.begin();
14208 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14209
14210 // v_perm will produce the original value
14211 if (Elt->PermMask == 0x3020100)
14212 return EltOp;
14213
14214 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14215 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14216 }
14217
14218 auto *FirstElt = Srcs.begin();
14219 auto *SecondElt = std::next(FirstElt);
14220
14221 SmallVector<SDValue, 3> Perms;
14222
14223 // If we have multiple sources in the chain, combine them via perms (using
14224 // calculated perm mask) and Ors.
14225 while (true) {
14226 auto FirstMask = FirstElt->PermMask;
14227 auto SecondMask = SecondElt->PermMask;
14228
14229 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14230 unsigned FirstPlusFour = FirstMask | 0x04040404;
14231 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14232 // original 0x0C.
14233 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14234
14235 auto PermMask = addPermMasks(FirstMask, SecondMask);
14236 auto FirstVal =
14237 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14238 auto SecondVal =
14239 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14240
14241 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14242 SecondVal,
14243 DAG.getConstant(PermMask, SL, MVT::i32)));
14244
14245 FirstElt = std::next(SecondElt);
14246 if (FirstElt == Srcs.end())
14247 break;
14248
14249 SecondElt = std::next(FirstElt);
14250 // If we only have a FirstElt, then just combine that into the cumulative
14251 // source node.
14252 if (SecondElt == Srcs.end()) {
14253 auto EltOp =
14254 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14255
14256 Perms.push_back(
14257 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14258 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14259 break;
14260 }
14261 }
14262
14263 assert(Perms.size() == 1 || Perms.size() == 2);
14264 return Perms.size() == 2
14265 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14266 : Perms[0];
14267}
14268
14269static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14270 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14271 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14272 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14273 EntryMask += ZeroMask;
14274 }
14275}
14276
14277static bool isMul(const SDValue Op) {
14278 auto Opcode = Op.getOpcode();
14279
14280 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14281 Opcode == AMDGPUISD::MUL_I24);
14282}
14283
14284static std::optional<bool>
14285checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14286 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14287 const SDValue &S1Op, const SelectionDAG &DAG) {
14288 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14289 // of the dot4 are irrelevant.
14290 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14291 return false;
14292
14293 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14294 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14295 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14296 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14297 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14298 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14299
14300 assert(!(S0IsUnsigned && S0IsSigned));
14301 assert(!(S1IsUnsigned && S1IsSigned));
14302
14303 // There are 9 possible permutations of
14304 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14305
14306 // In two permutations, the sign bits are known to be the same for both Ops,
14307 // so simply return Signed / Unsigned corresponding to the MSB
14308
14309 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14310 return S0IsSigned;
14311
14312 // In another two permutations, the sign bits are known to be opposite. In
14313 // this case return std::nullopt to indicate a bad match.
14314
14315 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14316 return std::nullopt;
14317
14318 // In the remaining five permutations, we don't know the value of the sign
14319 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14320 // the upper bits must be extension bits. Thus, the only way for the sign
14321 // bit to be unknown is if it was sign extended from an unknown value, or if
14322 // it was any extended. In either case, it is correct to use the signed
14323 // version of the dot4 signedness semantics.
14324
14325 // In two of these permutations, we know the sign bit is set for
14326 // one op and unknown for the other. It is okay to use the signed version of
14327 // dot4.
14328 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14329 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14330 return true;
14331
14332 // In one such permutation, we don't know either of the sign bits. It is
14333 // okay to use the signed version of dot4.
14334 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14335 return true;
14336
14337 // In two of these permutations, we know the sign bit is unset for
14338 // one op and unknown for the other. Return std::nullopt to indicate a
14339 // bad match.
14340 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14341 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14342 return std::nullopt;
14343
14344 llvm_unreachable("Fully covered condition");
14345}
14346
14347SDValue SITargetLowering::performAddCombine(SDNode *N,
14348 DAGCombinerInfo &DCI) const {
14349 SelectionDAG &DAG = DCI.DAG;
14350 EVT VT = N->getValueType(0);
14351 SDLoc SL(N);
14352 SDValue LHS = N->getOperand(0);
14353 SDValue RHS = N->getOperand(1);
14354
14355 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14356 if (Subtarget->hasMad64_32()) {
14357 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14358 return Folded;
14359 }
14360 }
14361
14362 if (SDValue V = reassociateScalarOps(N, DAG)) {
14363 return V;
14364 }
14365
14366 if (VT == MVT::i64) {
14367 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14368 return Folded;
14369 }
14370
14371 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14372 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14373 SDValue TempNode(N, 0);
14374 std::optional<bool> IsSigned;
14375 SmallVector<DotSrc, 4> Src0s;
14376 SmallVector<DotSrc, 4> Src1s;
14377 SmallVector<SDValue, 4> Src2s;
14378
14379 // Match the v_dot4 tree, while collecting src nodes.
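    // The expected shape is a chain of up to four byte-wise multiplies joined
    // by adds, e.g.
    //   add(mul(a0, b0), add(mul(a1, b1), add(mul(a2, b2), mul(a3, b3))))
    // The leftover non-multiply operand of the innermost add (or a zero
    // constant for the mul/mul special case below) becomes the accumulator.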
14380 int ChainLength = 0;
14381 for (int I = 0; I < 4; I++) {
14382 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14383 if (MulIdx == -1)
14384 break;
14385 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14386 if (!Src0)
14387 break;
14388 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14389 if (!Src1)
14390 break;
14391
14392 auto IterIsSigned = checkDot4MulSignedness(
14393 TempNode->getOperand(MulIdx), *Src0, *Src1,
14394 TempNode->getOperand(MulIdx)->getOperand(0),
14395 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14396 if (!IterIsSigned)
14397 break;
14398 if (!IsSigned)
14399 IsSigned = *IterIsSigned;
14400 if (*IterIsSigned != *IsSigned)
14401 break;
14402 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14403 auto AddIdx = 1 - MulIdx;
14404 // Allow the special case where add (add (mul24, 0), mul24) became ->
14405 // add (mul24, mul24).
14406 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14407 Src2s.push_back(TempNode->getOperand(AddIdx));
14408 auto Src0 =
14409 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14410 if (!Src0)
14411 break;
14412 auto Src1 =
14413 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14414 if (!Src1)
14415 break;
14416 auto IterIsSigned = checkDot4MulSignedness(
14417 TempNode->getOperand(AddIdx), *Src0, *Src1,
14418 TempNode->getOperand(AddIdx)->getOperand(0),
14419 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14420 if (!IterIsSigned)
14421 break;
14422 assert(IsSigned);
14423 if (*IterIsSigned != *IsSigned)
14424 break;
14425 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14426 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14427 ChainLength = I + 2;
14428 break;
14429 }
14430
14431 TempNode = TempNode->getOperand(AddIdx);
14432 Src2s.push_back(TempNode);
14433 ChainLength = I + 1;
14434 if (TempNode->getNumOperands() < 2)
14435 break;
14436 LHS = TempNode->getOperand(0);
14437 RHS = TempNode->getOperand(1);
14438 }
14439
14440 if (ChainLength < 2)
14441 return SDValue();
14442
14443 // Masks were constructed with the assumption that we would find a chain of
14444 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14445 // 0x0c) so they do not affect dot calculation.
14446 if (ChainLength < 4) {
14447 fixMasks(Src0s, ChainLength);
14448 fixMasks(Src1s, ChainLength);
14449 }
14450
14451 SDValue Src0, Src1;
14452
14453 // If we are just using a single source for both, and have permuted the
14454 // bytes consistently, we can just use the sources without permuting
14455 // (commutation).
14456 bool UseOriginalSrc = false;
14457 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14458 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14459 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14460 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14461 SmallVector<unsigned, 4> SrcBytes;
14462 auto Src0Mask = Src0s.begin()->PermMask;
14463 SrcBytes.push_back(Src0Mask & 0xFF000000);
14464 bool UniqueEntries = true;
14465 for (auto I = 1; I < 4; I++) {
14466 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14467
14468 if (is_contained(SrcBytes, NextByte)) {
14469 UniqueEntries = false;
14470 break;
14471 }
14472 SrcBytes.push_back(NextByte);
14473 }
14474
14475 if (UniqueEntries) {
14476 UseOriginalSrc = true;
14477
14478 auto *FirstElt = Src0s.begin();
14479 auto FirstEltOp =
14480 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14481
14482 auto *SecondElt = Src1s.begin();
14483 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14484 SecondElt->DWordOffset);
14485
14486 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14487 MVT::getIntegerVT(32));
14488 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14489 MVT::getIntegerVT(32));
14490 }
14491 }
14492
14493 if (!UseOriginalSrc) {
14494 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14495 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14496 }
14497
14498 assert(IsSigned);
14499 SDValue Src2 =
14500 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14501
14502 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14503 : Intrinsic::amdgcn_udot4,
14504 SL, MVT::i64);
14505
14506 assert(!VT.isVector());
14507 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14508 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14509
14510 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14511 }
14512
14513 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14514 return SDValue();
14515
14516 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14517 // add x, sext (setcc) => usubo_carry x, 0, setcc
14518 unsigned Opc = LHS.getOpcode();
14519 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14520 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14521 std::swap(RHS, LHS);
14522
14523 Opc = RHS.getOpcode();
14524 switch (Opc) {
14525 default:
14526 break;
14527 case ISD::ZERO_EXTEND:
14528 case ISD::SIGN_EXTEND:
14529 case ISD::ANY_EXTEND: {
14530 auto Cond = RHS.getOperand(0);
14531 // If this won't be a real VOPC output, we would still need to insert an
14532 // extra instruction anyway.
14533 if (!isBoolSGPR(Cond))
14534 break;
14535 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14536 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14537 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14538 return DAG.getNode(Opc, SL, VTList, Args);
14539 }
14540 case ISD::UADDO_CARRY: {
14541 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14542 if (!isNullConstant(RHS.getOperand(1)))
14543 break;
14544 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14545 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14546 }
14547 }
14548 return SDValue();
14549}
14550
14551SDValue SITargetLowering::performSubCombine(SDNode *N,
14552 DAGCombinerInfo &DCI) const {
14553 SelectionDAG &DAG = DCI.DAG;
14554 EVT VT = N->getValueType(0);
14555
14556 if (VT == MVT::i64) {
14557 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14558 return Folded;
14559 }
14560
14561 if (VT != MVT::i32)
14562 return SDValue();
14563
14564 SDLoc SL(N);
14565 SDValue LHS = N->getOperand(0);
14566 SDValue RHS = N->getOperand(1);
14567
14568 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14569 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14570 unsigned Opc = RHS.getOpcode();
14571 switch (Opc) {
14572 default:
14573 break;
14574 case ISD::ZERO_EXTEND:
14575 case ISD::SIGN_EXTEND:
14576 case ISD::ANY_EXTEND: {
14577 auto Cond = RHS.getOperand(0);
14578 // If this won't be a real VOPC output, we would still need to insert an
14579 // extra instruction anyway.
14580 if (!isBoolSGPR(Cond))
14581 break;
14582 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14583 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14584 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14585 return DAG.getNode(Opc, SL, VTList, Args);
14586 }
14587 }
14588
14589 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14590 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14591 if (!isNullConstant(LHS.getOperand(1)))
14592 return SDValue();
14593 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14594 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14595 }
14596 return SDValue();
14597}
14598
14599SDValue
14600SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14601 DAGCombinerInfo &DCI) const {
14602
14603 if (N->getValueType(0) != MVT::i32)
14604 return SDValue();
14605
14606 if (!isNullConstant(N->getOperand(1)))
14607 return SDValue();
14608
14609 SelectionDAG &DAG = DCI.DAG;
14610 SDValue LHS = N->getOperand(0);
14611
14612 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14613 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14614 unsigned LHSOpc = LHS.getOpcode();
14615 unsigned Opc = N->getOpcode();
14616 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14617 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14618 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14619 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14620 }
14621 return SDValue();
14622}
14623
14624SDValue SITargetLowering::performFAddCombine(SDNode *N,
14625 DAGCombinerInfo &DCI) const {
14626 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14627 return SDValue();
14628
14629 SelectionDAG &DAG = DCI.DAG;
14630 EVT VT = N->getValueType(0);
14631
14632 SDLoc SL(N);
14633 SDValue LHS = N->getOperand(0);
14634 SDValue RHS = N->getOperand(1);
14635
14636 // These should really be instruction patterns, but writing patterns with
14637 // source modifiers is a pain.
14638
14639 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14640 if (LHS.getOpcode() == ISD::FADD) {
14641 SDValue A = LHS.getOperand(0);
14642 if (A == LHS.getOperand(1)) {
14643 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14644 if (FusedOp != 0) {
14645 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14646 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14647 }
14648 }
14649 }
14650
14651 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14652 if (RHS.getOpcode() == ISD::FADD) {
14653 SDValue A = RHS.getOperand(0);
14654 if (A == RHS.getOperand(1)) {
14655 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14656 if (FusedOp != 0) {
14657 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14658 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14659 }
14660 }
14661 }
14662
14663 return SDValue();
14664}
14665
14666SDValue SITargetLowering::performFSubCombine(SDNode *N,
14667 DAGCombinerInfo &DCI) const {
14668 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14669 return SDValue();
14670
14671 SelectionDAG &DAG = DCI.DAG;
14672 SDLoc SL(N);
14673 EVT VT = N->getValueType(0);
14674 assert(!VT.isVector());
14675
14676 // Try to get the fneg to fold into the source modifier. This undoes generic
14677 // DAG combines and folds them into the mad.
14678 //
14679 // Only do this if we are not trying to support denormals. v_mad_f32 does
14680 // not support denormals ever.
14681 SDValue LHS = N->getOperand(0);
14682 SDValue RHS = N->getOperand(1);
14683 if (LHS.getOpcode() == ISD::FADD) {
14684 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14685 SDValue A = LHS.getOperand(0);
14686 if (A == LHS.getOperand(1)) {
14687 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14688 if (FusedOp != 0) {
14689 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14690 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14691
14692 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14693 }
14694 }
14695 }
14696
14697 if (RHS.getOpcode() == ISD::FADD) {
14698 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14699
14700 SDValue A = RHS.getOperand(0);
14701 if (A == RHS.getOperand(1)) {
14702 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14703 if (FusedOp != 0) {
14704 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14705 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14706 }
14707 }
14708 }
14709
14710 return SDValue();
14711}
14712
14713SDValue SITargetLowering::performFDivCombine(SDNode *N,
14714 DAGCombinerInfo &DCI) const {
14715 SelectionDAG &DAG = DCI.DAG;
14716 SDLoc SL(N);
14717 EVT VT = N->getValueType(0);
14718 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14719 return SDValue();
14720
14721 SDValue LHS = N->getOperand(0);
14722 SDValue RHS = N->getOperand(1);
14723
14724 SDNodeFlags Flags = N->getFlags();
14725 SDNodeFlags RHSFlags = RHS->getFlags();
14726 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14727 !RHS->hasOneUse())
14728 return SDValue();
14729
14730 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14731 bool IsNegative = false;
14732 if (CLHS->isExactlyValue(1.0) ||
14733 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14734 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14735 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14736 if (RHS.getOpcode() == ISD::FSQRT) {
14737 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14738 SDValue Rsq =
14739 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14740 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14741 }
14742 }
14743 }
14744
14745 return SDValue();
14746}
14747
14748SDValue SITargetLowering::performFMulCombine(SDNode *N,
14749 DAGCombinerInfo &DCI) const {
14750 SelectionDAG &DAG = DCI.DAG;
14751 EVT VT = N->getValueType(0);
14752 EVT ScalarVT = VT.getScalarType();
14753 EVT IntVT = VT.changeElementType(MVT::i32);
14754
14755 SDValue LHS = N->getOperand(0);
14756 SDValue RHS = N->getOperand(1);
14757
14758 // It is cheaper to realize i32 inline constants as compared against
14759 // materializing f16 or f64 (or even non-inline f32) values,
14760 // possible via ldexp usage, as shown below :
14761 //
14762 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14763 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14764 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
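  // For instance, fmul x, (select y, 8.0, 0.5) can become
  // ldexp(x, (select i32 y, 3, -1)) since 8.0 == 2^3 and 0.5 == 2^-1.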
14765 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14766 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14767 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14768 if (!TrueNode)
14769 return SDValue();
14770 const ConstantFPSDNode *FalseNode =
14771 isConstOrConstSplatFP(RHS.getOperand(2));
14772 if (!FalseNode)
14773 return SDValue();
14774
14775 if (TrueNode->isNegative() != FalseNode->isNegative())
14776 return SDValue();
14777
14778 // For f32, only non-inline constants should be transformed.
14779 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14780 if (ScalarVT == MVT::f32 &&
14781 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14782 TII->isInlineConstant(FalseNode->getValueAPF()))
14783 return SDValue();
14784
14785 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14786 if (TrueNodeExpVal == INT_MIN)
14787 return SDValue();
14788 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14789 if (FalseNodeExpVal == INT_MIN)
14790 return SDValue();
14791
14792 SDLoc SL(N);
14793 SDValue SelectNode =
14794 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14795 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14796 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14797
14798 LHS = TrueNode->isNegative()
14799 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14800 : LHS;
14801
14802 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14803 }
14804
14805 return SDValue();
14806}
14807
14808SDValue SITargetLowering::performFMACombine(SDNode *N,
14809 DAGCombinerInfo &DCI) const {
14810 SelectionDAG &DAG = DCI.DAG;
14811 EVT VT = N->getValueType(0);
14812 SDLoc SL(N);
14813
14814 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14815 return SDValue();
14816
14817 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14818 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14819 SDValue Op1 = N->getOperand(0);
14820 SDValue Op2 = N->getOperand(1);
14821 SDValue FMA = N->getOperand(2);
14822
14823 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14824 Op2.getOpcode() != ISD::FP_EXTEND)
14825 return SDValue();
14826
14827 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14828 // regardless of the denorm mode setting. Therefore,
14829 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14830 const TargetOptions &Options = DAG.getTarget().Options;
14831 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14832 (N->getFlags().hasAllowContract() &&
14833 FMA->getFlags().hasAllowContract())) {
14834 Op1 = Op1.getOperand(0);
14835 Op2 = Op2.getOperand(0);
14836 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14837 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14838 return SDValue();
14839
14840 SDValue Vec1 = Op1.getOperand(0);
14841 SDValue Idx1 = Op1.getOperand(1);
14842 SDValue Vec2 = Op2.getOperand(0);
14843
14844 SDValue FMAOp1 = FMA.getOperand(0);
14845 SDValue FMAOp2 = FMA.getOperand(1);
14846 SDValue FMAAcc = FMA.getOperand(2);
14847
14848 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14849 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14850 return SDValue();
14851
14852 FMAOp1 = FMAOp1.getOperand(0);
14853 FMAOp2 = FMAOp2.getOperand(0);
14854 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14855 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14856 return SDValue();
14857
14858 SDValue Vec3 = FMAOp1.getOperand(0);
14859 SDValue Vec4 = FMAOp2.getOperand(0);
14860 SDValue Idx2 = FMAOp1.getOperand(1);
14861
14862 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14863 // Idx1 and Idx2 cannot be the same.
14864 Idx1 == Idx2)
14865 return SDValue();
14866
14867 if (Vec1 == Vec2 || Vec3 == Vec4)
14868 return SDValue();
14869
14870 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14871 return SDValue();
14872
14873 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14874 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14875 DAG.getTargetConstant(0, SL, MVT::i1));
14876 }
14877 }
14878 return SDValue();
14879}
14880
14881SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14882 DAGCombinerInfo &DCI) const {
14883 SelectionDAG &DAG = DCI.DAG;
14884 SDLoc SL(N);
14885
14886 SDValue LHS = N->getOperand(0);
14887 SDValue RHS = N->getOperand(1);
14888 EVT VT = LHS.getValueType();
14889 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14890
14891 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14892 if (!CRHS) {
14893 CRHS = dyn_cast<ConstantSDNode>(LHS);
14894 if (CRHS) {
14895 std::swap(LHS, RHS);
14896 CC = ISD::getSetCCSwappedOperands(CC);
14897 }
14898 }
14899
14900 if (CRHS) {
14901 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14902 isBoolSGPR(LHS.getOperand(0))) {
14903 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14904 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14905 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14906 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14907 if ((CRHS->isAllOnes() &&
14908 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14909 (CRHS->isZero() &&
14910 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14911 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14912 DAG.getAllOnesConstant(SL, MVT::i1));
14913 if ((CRHS->isAllOnes() &&
14914 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14915 (CRHS->isZero() &&
14916 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14917 return LHS.getOperand(0);
14918 }
14919
14920 const APInt &CRHSVal = CRHS->getAPIntValue();
14921 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14922 LHS.getOpcode() == ISD::SELECT &&
14923 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14924 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14925 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14926 isBoolSGPR(LHS.getOperand(0))) {
14927 // Given CT != FT:
14928 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14929 // setcc (select cc, CT, CF), CF, ne => cc
14930 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14931 // setcc (select cc, CT, CF), CT, eq => cc
14932 const APInt &CT = LHS.getConstantOperandAPInt(1);
14933 const APInt &CF = LHS.getConstantOperandAPInt(2);
14934
14935 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14936 (CT == CRHSVal && CC == ISD::SETNE))
14937 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14938 DAG.getAllOnesConstant(SL, MVT::i1));
14939 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14940 (CT == CRHSVal && CC == ISD::SETEQ))
14941 return LHS.getOperand(0);
14942 }
14943 }
14944
14945 if (VT != MVT::f32 && VT != MVT::f64 &&
14946 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14947 return SDValue();
14948
14949 // Match isinf/isfinite pattern
14950 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14951 // (fcmp one (fabs x), inf) -> (fp_class x,
14952   //     (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
14953 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14954 LHS.getOpcode() == ISD::FABS) {
14955 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14956 if (!CRHS)
14957 return SDValue();
14958
14959 const APFloat &APF = CRHS->getValueAPF();
14960 if (APF.isInfinity() && !APF.isNegative()) {
14961       const unsigned IsInfMask =
14962           SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14963       const unsigned IsFiniteMask =
14964           SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14965           SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14966           SIInstrFlags::P_SUBNORMAL;
14967 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14968 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14969 DAG.getConstant(Mask, SL, MVT::i32));
14970 }
14971 }
14972
14973 return SDValue();
14974}
14975
14976SDValue
14977SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14978 DAGCombinerInfo &DCI) const {
14979 SelectionDAG &DAG = DCI.DAG;
14980 SDLoc SL(N);
14981 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14982
14983 SDValue Src = N->getOperand(0);
14984 SDValue Shift = N->getOperand(0);
14985
14986 // TODO: Extend type shouldn't matter (assuming legal types).
14987 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14988 Shift = Shift.getOperand(0);
14989
14990 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14991 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14992 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14993 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14994 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14995 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14996 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14997 SDValue Shifted = DAG.getZExtOrTrunc(
14998 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14999
15000 unsigned ShiftOffset = 8 * Offset;
15001 if (Shift.getOpcode() == ISD::SHL)
15002 ShiftOffset -= C->getZExtValue();
15003 else
15004 ShiftOffset += C->getZExtValue();
15005
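      // ShiftOffset is the bit offset of the byte the combined node would read
      // from the pre-shift value; e.g. cvt_f32_ubyte1 (srl x, 8) reads bits
      // [16, 24) of x and therefore becomes cvt_f32_ubyte2 x.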
15006 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15007 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15008 MVT::f32, Shifted);
15009 }
15010 }
15011 }
15012
15013 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15014 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15015 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15016 // We simplified Src. If this node is not dead, visit it again so it is
15017 // folded properly.
15018 if (N->getOpcode() != ISD::DELETED_NODE)
15019 DCI.AddToWorklist(N);
15020 return SDValue(N, 0);
15021 }
15022
15023 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15024 if (SDValue DemandedSrc =
15025           TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15026     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15027
15028 return SDValue();
15029}
15030
15031SDValue SITargetLowering::performClampCombine(SDNode *N,
15032 DAGCombinerInfo &DCI) const {
15033 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15034 if (!CSrc)
15035 return SDValue();
15036
15037 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15038 const APFloat &F = CSrc->getValueAPF();
15039 APFloat Zero = APFloat::getZero(F.getSemantics());
15040 if (F < Zero ||
15041 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15042 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15043 }
15044
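  // A value already inside [0, 1] (or a NaN when DX10Clamp is off) fails both
  // range checks, so the clamp simply folds to the original constant below.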
15045 APFloat One(F.getSemantics(), "1.0");
15046 if (F > One)
15047 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15048
15049 return SDValue(CSrc, 0);
15050}
15051
15052 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15053                                             DAGCombinerInfo &DCI) const {
15054 switch (N->getOpcode()) {
15055 case ISD::ADD:
15056 case ISD::SUB:
15057 case ISD::SHL:
15058 case ISD::SRL:
15059 case ISD::SRA:
15060 case ISD::AND:
15061 case ISD::OR:
15062 case ISD::XOR:
15063 case ISD::MUL:
15064 case ISD::SETCC:
15065 case ISD::SELECT:
15066 case ISD::SMIN:
15067 case ISD::SMAX:
15068 case ISD::UMIN:
15069 case ISD::UMAX:
15070 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15071 return Res;
15072 break;
15073 default:
15074 break;
15075 }
15076
15077 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15078 return SDValue();
15079
15080 switch (N->getOpcode()) {
15081 case ISD::ADD:
15082 return performAddCombine(N, DCI);
15083 case ISD::SUB:
15084 return performSubCombine(N, DCI);
15085 case ISD::UADDO_CARRY:
15086 case ISD::USUBO_CARRY:
15087 return performAddCarrySubCarryCombine(N, DCI);
15088 case ISD::FADD:
15089 return performFAddCombine(N, DCI);
15090 case ISD::FSUB:
15091 return performFSubCombine(N, DCI);
15092 case ISD::FDIV:
15093 return performFDivCombine(N, DCI);
15094 case ISD::FMUL:
15095 return performFMulCombine(N, DCI);
15096 case ISD::SETCC:
15097 return performSetCCCombine(N, DCI);
15098 case ISD::FMAXNUM:
15099 case ISD::FMINNUM:
15100 case ISD::FMAXNUM_IEEE:
15101 case ISD::FMINNUM_IEEE:
15102 case ISD::FMAXIMUM:
15103 case ISD::FMINIMUM:
15104 case ISD::SMAX:
15105 case ISD::SMIN:
15106 case ISD::UMAX:
15107 case ISD::UMIN:
15108   case AMDGPUISD::FMIN_LEGACY:
15109   case AMDGPUISD::FMAX_LEGACY:
15110     return performMinMaxCombine(N, DCI);
15111 case ISD::FMA:
15112 return performFMACombine(N, DCI);
15113 case ISD::AND:
15114 return performAndCombine(N, DCI);
15115 case ISD::OR:
15116 return performOrCombine(N, DCI);
15117 case ISD::FSHR: {
15118     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15119     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15120 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15121 return matchPERM(N, DCI);
15122 }
15123 break;
15124 }
15125 case ISD::XOR:
15126 return performXorCombine(N, DCI);
15127 case ISD::ZERO_EXTEND:
15128 return performZeroExtendCombine(N, DCI);
15129   case ISD::SIGN_EXTEND_INREG:
15130     return performSignExtendInRegCombine(N, DCI);
15131   case AMDGPUISD::FP_CLASS:
15132     return performClassCombine(N, DCI);
15133 case ISD::FCANONICALIZE:
15134 return performFCanonicalizeCombine(N, DCI);
15135 case AMDGPUISD::RCP:
15136 return performRcpCombine(N, DCI);
15137 case ISD::FLDEXP:
15138 case AMDGPUISD::FRACT:
15139 case AMDGPUISD::RSQ:
15140   case AMDGPUISD::RCP_LEGACY:
15141   case AMDGPUISD::RCP_IFLAG:
15142   case AMDGPUISD::RSQ_CLAMP: {
15143 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15144 SDValue Src = N->getOperand(0);
15145 if (Src.isUndef())
15146 return Src;
15147 break;
15148 }
15149 case ISD::SINT_TO_FP:
15150 case ISD::UINT_TO_FP:
15151 return performUCharToFloatCombine(N, DCI);
15152 case ISD::FCOPYSIGN:
15153 return performFCopySignCombine(N, DCI);
15154   case AMDGPUISD::CVT_F32_UBYTE0:
15155   case AMDGPUISD::CVT_F32_UBYTE1:
15156   case AMDGPUISD::CVT_F32_UBYTE2:
15157   case AMDGPUISD::CVT_F32_UBYTE3:
15158     return performCvtF32UByteNCombine(N, DCI);
15159 case AMDGPUISD::FMED3:
15160 return performFMed3Combine(N, DCI);
15161   case AMDGPUISD::CVT_PKRTZ_F16_F32:
15162     return performCvtPkRTZCombine(N, DCI);
15163 case AMDGPUISD::CLAMP:
15164 return performClampCombine(N, DCI);
15165 case ISD::SCALAR_TO_VECTOR: {
15166 SelectionDAG &DAG = DCI.DAG;
15167 EVT VT = N->getValueType(0);
15168
15169 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15170 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15171 SDLoc SL(N);
15172 SDValue Src = N->getOperand(0);
15173 EVT EltVT = Src.getValueType();
15174 if (EltVT != MVT::i16)
15175 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15176
15177 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15178 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15179 }
15180
15181 break;
15182 }
15183   case ISD::EXTRACT_VECTOR_ELT:
15184     return performExtractVectorEltCombine(N, DCI);
15185   case ISD::INSERT_VECTOR_ELT:
15186     return performInsertVectorEltCombine(N, DCI);
15187 case ISD::FP_ROUND:
15188 return performFPRoundCombine(N, DCI);
15189 case ISD::LOAD: {
15190 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15191 return Widened;
15192 [[fallthrough]];
15193 }
15194 default: {
15195 if (!DCI.isBeforeLegalize()) {
15196 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15197 return performMemSDNodeCombine(MemNode, DCI);
15198 }
15199
15200 break;
15201 }
15202 }
15203
15204   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15205 }
15206
15207/// Helper function for adjustWritemask
15208static unsigned SubIdx2Lane(unsigned Idx) {
15209 switch (Idx) {
15210 default:
15211 return ~0u;
15212 case AMDGPU::sub0:
15213 return 0;
15214 case AMDGPU::sub1:
15215 return 1;
15216 case AMDGPU::sub2:
15217 return 2;
15218 case AMDGPU::sub3:
15219 return 3;
15220 case AMDGPU::sub4:
15221 return 4; // Possible with TFE/LWE
15222 }
15223}
15224
15225/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15226SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15227 SelectionDAG &DAG) const {
15228 unsigned Opcode = Node->getMachineOpcode();
15229
15230 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15231 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15232 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15233 return Node; // not implemented for D16
15234
15235 SDNode *Users[5] = {nullptr};
15236 unsigned Lane = 0;
15237 unsigned DmaskIdx =
15238 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15239 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15240 unsigned NewDmask = 0;
15241 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15242 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15243 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15244 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15245 ? true
15246 : false;
15247 unsigned TFCLane = 0;
15248 bool HasChain = Node->getNumValues() > 1;
15249
15250 if (OldDmask == 0) {
15251 // These are folded out, but on the chance it happens don't assert.
15252 return Node;
15253 }
15254
15255 unsigned OldBitsSet = llvm::popcount(OldDmask);
15256 // Work out which is the TFE/LWE lane if that is enabled.
15257 if (UsesTFC) {
15258 TFCLane = OldBitsSet;
15259 }
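  // When TFE/LWE is used, the status dword is written to the register right
  // after the last enabled dmask channel, so its packed lane index equals the
  // number of bits set in the old dmask.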
15260
15261 // Try to figure out the used register components
15262 for (SDUse &Use : Node->uses()) {
15263
15264 // Don't look at users of the chain.
15265 if (Use.getResNo() != 0)
15266 continue;
15267
15268 SDNode *User = Use.getUser();
15269
15270 // Abort if we can't understand the usage
15271 if (!User->isMachineOpcode() ||
15272 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15273 return Node;
15274
15275 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15276 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15277 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15278 // set, etc.
15279 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15280 if (Lane == ~0u)
15281 return Node;
15282
15283 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15284 if (UsesTFC && Lane == TFCLane) {
15285 Users[Lane] = User;
15286 } else {
15287 // Set which texture component corresponds to the lane.
15288 unsigned Comp;
15289 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15290 Comp = llvm::countr_zero(Dmask);
15291 Dmask &= ~(1 << Comp);
15292 }
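      // Comp is now the index of the Lane'th set bit of OldDmask (counting
      // from zero), i.e. the original texture component (X/Y/Z/W) that this
      // EXTRACT_SUBREG reads.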
15293
15294 // Abort if we have more than one user per component.
15295 if (Users[Lane])
15296 return Node;
15297
15298 Users[Lane] = User;
15299 NewDmask |= 1 << Comp;
15300 }
15301 }
15302
15303 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15304 bool NoChannels = !NewDmask;
15305 if (NoChannels) {
15306 if (!UsesTFC) {
15307 // No uses of the result and not using TFC. Then do nothing.
15308 return Node;
15309 }
15310 // If the original dmask has one channel - then nothing to do
15311 if (OldBitsSet == 1)
15312 return Node;
15313 // Use an arbitrary dmask - required for the instruction to work
15314 NewDmask = 1;
15315 }
15316 // Abort if there's no change
15317 if (NewDmask == OldDmask)
15318 return Node;
15319
15320 unsigned BitsSet = llvm::popcount(NewDmask);
15321
15322 // Check for TFE or LWE - increase the number of channels by one to account
15323 // for the extra return value
15324 // This will need adjustment for D16 if this is also included in
15325 // adjustWriteMask (this function) but at present D16 are excluded.
15326 unsigned NewChannels = BitsSet + UsesTFC;
15327
15328 int NewOpcode =
15329 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15330 assert(NewOpcode != -1 &&
15331 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15332 "failed to find equivalent MIMG op");
15333
15334 // Adjust the writemask in the node
15335   SmallVector<SDValue, 12> Ops;
15336   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15337 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15338 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15339
15340 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15341
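  // A 3-channel result is widened to a 4-element vector and a 5-channel result
  // (four data channels plus the TFE/LWE status) to an 8-element vector; other
  // channel counts map to a vector of exactly that many elements.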
15342 MVT ResultVT = NewChannels == 1
15343 ? SVT
15344 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15345 : NewChannels == 5 ? 8
15346 : NewChannels);
15347 SDVTList NewVTList =
15348 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15349
15350 MachineSDNode *NewNode =
15351 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15352
15353 if (HasChain) {
15354 // Update chain.
15355 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15356 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15357 }
15358
15359 if (NewChannels == 1) {
15360 assert(Node->hasNUsesOfValue(1, 0));
15361 SDNode *Copy =
15362 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15363 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15364 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15365 return nullptr;
15366 }
15367
15368 // Update the users of the node with the new indices
15369 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15370 SDNode *User = Users[i];
15371 if (!User) {
15372 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15373 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15374 if (i || !NoChannels)
15375 continue;
15376 } else {
15377 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15378 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15379 if (NewUser != User) {
15380 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15381 DAG.RemoveDeadNode(User);
15382 }
15383 }
15384
15385 switch (Idx) {
15386 default:
15387 break;
15388 case AMDGPU::sub0:
15389 Idx = AMDGPU::sub1;
15390 break;
15391 case AMDGPU::sub1:
15392 Idx = AMDGPU::sub2;
15393 break;
15394 case AMDGPU::sub2:
15395 Idx = AMDGPU::sub3;
15396 break;
15397 case AMDGPU::sub3:
15398 Idx = AMDGPU::sub4;
15399 break;
15400 }
15401 }
15402
15403 DAG.RemoveDeadNode(Node);
15404 return nullptr;
15405}
15406
15407 static bool isFrameIndexOp(SDValue Op) {
15408   if (Op.getOpcode() == ISD::AssertZext)
15409 Op = Op.getOperand(0);
15410
15411 return isa<FrameIndexSDNode>(Op);
15412}
15413
15414/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15415/// with frame index operands.
15416 /// LLVM assumes that inputs to these instructions are registers.
15417SDNode *
15418 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15419                                                 SelectionDAG &DAG) const {
15420 if (Node->getOpcode() == ISD::CopyToReg) {
15421 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15422 SDValue SrcVal = Node->getOperand(2);
15423
15424 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15425 // to try understanding copies to physical registers.
15426 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15427 SDLoc SL(Node);
15428       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15429       SDValue VReg = DAG.getRegister(
15430 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15431
15432 SDNode *Glued = Node->getGluedNode();
15433 SDValue ToVReg = DAG.getCopyToReg(
15434 Node->getOperand(0), SL, VReg, SrcVal,
15435 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15436 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15437 VReg, ToVReg.getValue(1));
15438 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15439 DAG.RemoveDeadNode(Node);
15440 return ToResultReg.getNode();
15441 }
15442 }
15443
15444   SmallVector<SDValue, 8> Ops;
15445   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15446 if (!isFrameIndexOp(Node->getOperand(i))) {
15447 Ops.push_back(Node->getOperand(i));
15448 continue;
15449 }
15450
15451 SDLoc DL(Node);
15452 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15453 Node->getOperand(i).getValueType(),
15454 Node->getOperand(i)),
15455 0));
15456 }
15457
15458 return DAG.UpdateNodeOperands(Node, Ops);
15459}
15460
15461/// Fold the instructions after selecting them.
15462/// Returns null if users were already updated.
15463 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15464                                           SelectionDAG &DAG) const {
15465   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15466   unsigned Opcode = Node->getMachineOpcode();
15467
15468 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15469 !TII->isGather4(Opcode) &&
15470 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15471 return adjustWritemask(Node, DAG);
15472 }
15473
15474 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15475     legalizeTargetIndependentNode(Node, DAG);
15476     return Node;
15477 }
15478
15479 switch (Opcode) {
15480 case AMDGPU::V_DIV_SCALE_F32_e64:
15481 case AMDGPU::V_DIV_SCALE_F64_e64: {
15482 // Satisfy the operand register constraint when one of the inputs is
15483 // undefined. Ordinarily each undef value will have its own implicit_def of
15484 // a vreg, so force these to use a single register.
15485 SDValue Src0 = Node->getOperand(1);
15486 SDValue Src1 = Node->getOperand(3);
15487 SDValue Src2 = Node->getOperand(5);
15488
15489 if ((Src0.isMachineOpcode() &&
15490 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15491 (Src0 == Src1 || Src0 == Src2))
15492 break;
15493
15494 MVT VT = Src0.getValueType().getSimpleVT();
15495 const TargetRegisterClass *RC =
15496 getRegClassFor(VT, Src0.getNode()->isDivergent());
15497
15498     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15499     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15500
15501 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15502 Src0, SDValue());
15503
15504 // src0 must be the same register as src1 or src2, even if the value is
15505 // undefined, so make sure we don't violate this constraint.
15506 if (Src0.isMachineOpcode() &&
15507 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15508 if (Src1.isMachineOpcode() &&
15509 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15510 Src0 = Src1;
15511 else if (Src2.isMachineOpcode() &&
15512 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15513 Src0 = Src2;
15514 else {
15515 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15516 Src0 = UndefReg;
15517 Src1 = UndefReg;
15518 }
15519 } else
15520 break;
15521
15522 SmallVector<SDValue, 9> Ops(Node->ops());
15523 Ops[1] = Src0;
15524 Ops[3] = Src1;
15525 Ops[5] = Src2;
15526 Ops.push_back(ImpDef.getValue(1));
15527 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15528 }
15529 default:
15530 break;
15531 }
15532
15533 return Node;
15534}
15535
15536// Any MIMG instructions that use tfe or lwe require an initialization of the
15537// result register that will be written in the case of a memory access failure.
15538// The required code is also added to tie this init code to the result of the
15539// img instruction.
15540 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15541   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15542   const SIRegisterInfo &TRI = TII->getRegisterInfo();
15543 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15544 MachineBasicBlock &MBB = *MI.getParent();
15545
15546 int DstIdx =
15547 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15548 unsigned InitIdx = 0;
15549
15550 if (TII->isImage(MI)) {
15551 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15552 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15553 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15554
15555 if (!TFE && !LWE) // intersect_ray
15556 return;
15557
15558 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15559 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15560 unsigned D16Val = D16 ? D16->getImm() : 0;
15561
15562 if (!TFEVal && !LWEVal)
15563 return;
15564
15565     // At least one of TFE or LWE is non-zero.
15566 // We have to insert a suitable initialization of the result value and
15567 // tie this to the dest of the image instruction.
15568
15569 // Calculate which dword we have to initialize to 0.
15570 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15571
15572 // check that dmask operand is found.
15573 assert(MO_Dmask && "Expected dmask operand in instruction");
15574
15575 unsigned dmask = MO_Dmask->getImm();
15576 // Determine the number of active lanes taking into account the
15577 // Gather4 special case
15578 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15579
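    // With packed D16 two 16-bit channels share one dword, so only
    // ceil(ActiveLanes / 2) data dwords need initializing; in both cases one
    // extra dword is reserved for the TFE/LWE status value.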
15580 bool Packed = !Subtarget->hasUnpackedD16VMem();
15581
15582 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15583
15584 // Abandon attempt if the dst size isn't large enough
15585 // - this is in fact an error but this is picked up elsewhere and
15586 // reported correctly.
15587 uint32_t DstSize =
15588 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15589 if (DstSize < InitIdx)
15590 return;
15591 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15592 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15593 } else {
15594 return;
15595 }
15596
15597 const DebugLoc &DL = MI.getDebugLoc();
15598
15599 // Create a register for the initialization value.
15600 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15601 unsigned NewDst = 0; // Final initialized value will be in here
15602
15603 // If PRTStrictNull feature is enabled (the default) then initialize
15604 // all the result registers to 0, otherwise just the error indication
15605 // register (VGPRn+1)
15606 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15607 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15608
15609 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15610 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15611 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15612 // Initialize dword
15613 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15614 // clang-format off
15615 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15616 .addImm(0);
15617 // clang-format on
15618 // Insert into the super-reg
15619 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15620 .addReg(PrevDst)
15621 .addReg(SubReg)
15622         .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15623
15624 PrevDst = NewDst;
15625 }
15626
15627 // Add as an implicit operand
15628 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15629
15630 // Tie the just added implicit operand to the dst
15631 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15632}
15633
15634/// Assign the register class depending on the number of
15635/// bits set in the writemask
15636 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15637                                                      SDNode *Node) const {
15638   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15639
15640 MachineFunction *MF = MI.getParent()->getParent();
15641   MachineRegisterInfo &MRI = MF->getRegInfo();
15642   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15643
15644 if (TII->isVOP3(MI.getOpcode())) {
15645 // Make sure constant bus requirements are respected.
15646 TII->legalizeOperandsVOP3(MRI, MI);
15647
15648 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15649 // This saves a chain-copy of registers and better balance register
15650 // use between vgpr and agpr as agpr tuples tend to be big.
15651 if (!MI.getDesc().operands().empty()) {
15652 unsigned Opc = MI.getOpcode();
15653 bool HasAGPRs = Info->mayNeedAGPRs();
15654 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15655 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15656 for (auto I :
15657 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15658 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15659 if (I == -1)
15660 break;
15661 if ((I == Src2Idx) && (HasAGPRs))
15662 break;
15663 MachineOperand &Op = MI.getOperand(I);
15664 if (!Op.isReg() || !Op.getReg().isVirtual())
15665 continue;
15666 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15667 if (!TRI->hasAGPRs(RC))
15668 continue;
15669 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15670 if (!Src || !Src->isCopy() ||
15671 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15672 continue;
15673 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15674 // All uses of agpr64 and agpr32 can also accept vgpr except for
15675 // v_accvgpr_read, but we do not produce agpr reads during selection,
15676 // so no use checks are needed.
15677 MRI.setRegClass(Op.getReg(), NewRC);
15678 }
15679
15680 if (TII->isMAI(MI)) {
15681 // The ordinary src0, src1, src2 were legalized above.
15682 //
15683 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15684 // as a separate instruction.
15685 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15686 AMDGPU::OpName::scale_src0);
15687 if (Src0Idx != -1) {
15688 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15689 AMDGPU::OpName::scale_src1);
15690 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15691 TII->usesConstantBus(MRI, MI, Src1Idx))
15692 TII->legalizeOpWithMove(MI, Src1Idx);
15693 }
15694 }
15695
15696 if (!HasAGPRs)
15697 return;
15698
15699 // Resolve the rest of AV operands to AGPRs.
15700 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15701 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15702 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15703 if (TRI->isVectorSuperClass(RC)) {
15704 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15705 MRI.setRegClass(Src2->getReg(), NewRC);
15706 if (Src2->isTied())
15707 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15708 }
15709 }
15710 }
15711 }
15712
15713 return;
15714 }
15715
15716 if (TII->isImage(MI))
15717 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15718}
15719
15720 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15721                               uint64_t Val) {
15722 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15723 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15724}
15725
15726 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15727                                                 const SDLoc &DL,
15728 SDValue Ptr) const {
15729   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15730
15731 // Build the half of the subregister with the constants before building the
15732 // full 128-bit register. If we are building multiple resource descriptors,
15733 // this will allow CSEing of the 2-component register.
15734 const SDValue Ops0[] = {
15735 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15736 buildSMovImm32(DAG, DL, 0),
15737 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15738 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15739 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15740
15741 SDValue SubRegHi = SDValue(
15742 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15743
15744 // Combine the constants and the pointer.
15745 const SDValue Ops1[] = {
15746 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15747 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15748 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15749
15750 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15751}
15752
15753/// Return a resource descriptor with the 'Add TID' bit enabled
15754/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15755/// of the resource descriptor) to create an offset, which is added to
15756/// the resource pointer.
15757 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15758                                            SDValue Ptr, uint32_t RsrcDword1,
15759 uint64_t RsrcDword2And3) const {
15760 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15761 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15762 if (RsrcDword1) {
15763 PtrHi =
15764 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15765 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15766 0);
15767 }
15768
15769 SDValue DataLo =
15770 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15771 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15772
15773 const SDValue Ops[] = {
15774 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15775 PtrLo,
15776 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15777 PtrHi,
15778 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15779 DataLo,
15780 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15781 DataHi,
15782 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15783
15784 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15785}
15786
15787//===----------------------------------------------------------------------===//
15788// SI Inline Assembly Support
15789//===----------------------------------------------------------------------===//
15790
15791std::pair<unsigned, const TargetRegisterClass *>
15792 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15793                                                StringRef Constraint,
15794 MVT VT) const {
15795 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15796
15797 const TargetRegisterClass *RC = nullptr;
15798 if (Constraint.size() == 1) {
15799 const unsigned BitWidth = VT.getSizeInBits();
15800 switch (Constraint[0]) {
15801 default:
15802 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15803 case 's':
15804 case 'r':
15805 switch (BitWidth) {
15806 case 16:
15807 RC = &AMDGPU::SReg_32RegClass;
15808 break;
15809 case 64:
15810 RC = &AMDGPU::SGPR_64RegClass;
15811 break;
15812 default:
15813       RC = TRI->getSGPRClassForBitWidth(BitWidth);
15814       if (!RC)
15815 return std::pair(0U, nullptr);
15816 break;
15817 }
15818 break;
15819 case 'v':
15820 switch (BitWidth) {
15821 case 16:
15822 RC = &AMDGPU::VGPR_32RegClass;
15823 break;
15824 default:
15825 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15826 if (!RC)
15827 return std::pair(0U, nullptr);
15828 break;
15829 }
15830 break;
15831 case 'a':
15832 if (!Subtarget->hasMAIInsts())
15833 break;
15834 switch (BitWidth) {
15835 case 16:
15836 RC = &AMDGPU::AGPR_32RegClass;
15837 break;
15838 default:
15839 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15840 if (!RC)
15841 return std::pair(0U, nullptr);
15842 break;
15843 }
15844 break;
15845 }
15846 // We actually support i128, i16 and f16 as inline parameters
15847 // even if they are not reported as legal
15848 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15849 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15850 return std::pair(0U, RC);
15851 }
15852
15853 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15854 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15855 if (RegName.consume_front("v")) {
15856 RC = &AMDGPU::VGPR_32RegClass;
15857 } else if (RegName.consume_front("s")) {
15858 RC = &AMDGPU::SGPR_32RegClass;
15859 } else if (RegName.consume_front("a")) {
15860 RC = &AMDGPU::AGPR_32RegClass;
15861 }
15862
15863 if (RC) {
15864 uint32_t Idx;
15865 if (RegName.consume_front("[")) {
15866 uint32_t End;
15867 bool Failed = RegName.consumeInteger(10, Idx);
15868 Failed |= !RegName.consume_front(":");
15869 Failed |= RegName.consumeInteger(10, End);
15870 Failed |= !RegName.consume_back("]");
15871 if (!Failed) {
15872 uint32_t Width = (End - Idx + 1) * 32;
15873 // Prohibit constraints for register ranges with a width that does not
15874 // match the required type.
15875 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15876 return std::pair(0U, nullptr);
15877 MCRegister Reg = RC->getRegister(Idx);
15878           if (SIRegisterInfo::isVGPRClass(RC))
15879             RC = TRI->getVGPRClassForBitWidth(Width);
15880 else if (SIRegisterInfo::isSGPRClass(RC))
15881 RC = TRI->getSGPRClassForBitWidth(Width);
15882 else if (SIRegisterInfo::isAGPRClass(RC))
15883 RC = TRI->getAGPRClassForBitWidth(Width);
15884 if (RC) {
15885 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15886 if (!Reg) {
15887 // The register class does not contain the requested register,
15888 // e.g., because it is an SGPR pair that would violate alignment
15889 // requirements.
15890 return std::pair(0U, nullptr);
15891 }
15892 return std::pair(Reg, RC);
15893 }
15894 }
15895 } else {
15896 // Check for lossy scalar/vector conversions.
15897 if (VT.isVector() && VT.getSizeInBits() != 32)
15898 return std::pair(0U, nullptr);
15899 bool Failed = RegName.getAsInteger(10, Idx);
15900 if (!Failed && Idx < RC->getNumRegs())
15901 return std::pair(RC->getRegister(Idx), RC);
15902 }
15903 }
15904 }
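  // For example, "{s17}" with an i32 operand resolves to SGPR17 in SGPR_32
  // above, while "{v[8:11]}" with a 128-bit operand resolves to the
  // four-register VGPR tuple starting at v8.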
15905
15906 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15907 if (Ret.first)
15908 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15909
15910 return Ret;
15911}
15912
15913static bool isImmConstraint(StringRef Constraint) {
15914 if (Constraint.size() == 1) {
15915 switch (Constraint[0]) {
15916 default:
15917 break;
15918 case 'I':
15919 case 'J':
15920 case 'A':
15921 case 'B':
15922 case 'C':
15923 return true;
15924 }
15925 } else if (Constraint == "DA" || Constraint == "DB") {
15926 return true;
15927 }
15928 return false;
15929}
15930
15931 SITargetLowering::ConstraintType
15932 SITargetLowering::getConstraintType(StringRef Constraint) const {
15933   if (Constraint.size() == 1) {
15934 switch (Constraint[0]) {
15935 default:
15936 break;
15937 case 's':
15938 case 'v':
15939 case 'a':
15940 return C_RegisterClass;
15941 }
15942 }
15943 if (isImmConstraint(Constraint)) {
15944 return C_Other;
15945 }
15946 return TargetLowering::getConstraintType(Constraint);
15947}
15948
15949static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15951 Val = Val & maskTrailingOnes<uint64_t>(Size);
15952 }
15953 return Val;
15954}
15955
15956 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15957                                                     StringRef Constraint,
15958 std::vector<SDValue> &Ops,
15959 SelectionDAG &DAG) const {
15960 if (isImmConstraint(Constraint)) {
15961 uint64_t Val;
15962 if (getAsmOperandConstVal(Op, Val) &&
15963 checkAsmConstraintVal(Op, Constraint, Val)) {
15964 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15965 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15966 }
15967 } else {
15968 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15969 }
15970}
15971
15972 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15973   unsigned Size = Op.getScalarValueSizeInBits();
15974 if (Size > 64)
15975 return false;
15976
15977 if (Size == 16 && !Subtarget->has16BitInsts())
15978 return false;
15979
15980 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15981 Val = C->getSExtValue();
15982 return true;
15983 }
15984 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15985 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15986 return true;
15987 }
15988 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15989 if (Size != 16 || Op.getNumOperands() != 2)
15990 return false;
15991 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15992 return false;
15993 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15994 Val = C->getSExtValue();
15995 return true;
15996 }
15997 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15998 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15999 return true;
16000 }
16001 }
16002
16003 return false;
16004}
16005
16006 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16007                                              uint64_t Val) const {
16008 if (Constraint.size() == 1) {
16009 switch (Constraint[0]) {
16010 case 'I':
16011       return AMDGPU::isInlinableIntLiteral(Val);
16012     case 'J':
16013 return isInt<16>(Val);
16014 case 'A':
16015 return checkAsmConstraintValA(Op, Val);
16016 case 'B':
16017 return isInt<32>(Val);
16018 case 'C':
16019 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16020              AMDGPU::isInlinableIntLiteral(Val);
16021     default:
16022 break;
16023 }
16024 } else if (Constraint.size() == 2) {
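    // "DA" requires each 32-bit half of the 64-bit value to be a valid inline
    // constant on its own; "DB" places no restriction on the value.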
16025 if (Constraint == "DA") {
16026 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16027 int64_t LoBits = static_cast<int32_t>(Val);
16028 return checkAsmConstraintValA(Op, HiBits, 32) &&
16029 checkAsmConstraintValA(Op, LoBits, 32);
16030 }
16031 if (Constraint == "DB") {
16032 return true;
16033 }
16034 }
16035 llvm_unreachable("Invalid asm constraint");
16036}
16037
16038 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16039                                               unsigned MaxSize) const {
16040 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16041 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16042 if (Size == 16) {
16043 MVT VT = Op.getSimpleValueType();
16044 switch (VT.SimpleTy) {
16045 default:
16046 return false;
16047 case MVT::i16:
16048 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16049 case MVT::f16:
16050 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16051 case MVT::bf16:
16052 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16053 case MVT::v2i16:
16054 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16055 case MVT::v2f16:
16056 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16057 case MVT::v2bf16:
16058 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16059 }
16060 }
16061 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16062 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16063 return true;
16064 return false;
16065}
16066
16067static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16068 switch (UnalignedClassID) {
16069 case AMDGPU::VReg_64RegClassID:
16070 return AMDGPU::VReg_64_Align2RegClassID;
16071 case AMDGPU::VReg_96RegClassID:
16072 return AMDGPU::VReg_96_Align2RegClassID;
16073 case AMDGPU::VReg_128RegClassID:
16074 return AMDGPU::VReg_128_Align2RegClassID;
16075 case AMDGPU::VReg_160RegClassID:
16076 return AMDGPU::VReg_160_Align2RegClassID;
16077 case AMDGPU::VReg_192RegClassID:
16078 return AMDGPU::VReg_192_Align2RegClassID;
16079 case AMDGPU::VReg_224RegClassID:
16080 return AMDGPU::VReg_224_Align2RegClassID;
16081 case AMDGPU::VReg_256RegClassID:
16082 return AMDGPU::VReg_256_Align2RegClassID;
16083 case AMDGPU::VReg_288RegClassID:
16084 return AMDGPU::VReg_288_Align2RegClassID;
16085 case AMDGPU::VReg_320RegClassID:
16086 return AMDGPU::VReg_320_Align2RegClassID;
16087 case AMDGPU::VReg_352RegClassID:
16088 return AMDGPU::VReg_352_Align2RegClassID;
16089 case AMDGPU::VReg_384RegClassID:
16090 return AMDGPU::VReg_384_Align2RegClassID;
16091 case AMDGPU::VReg_512RegClassID:
16092 return AMDGPU::VReg_512_Align2RegClassID;
16093 case AMDGPU::VReg_1024RegClassID:
16094 return AMDGPU::VReg_1024_Align2RegClassID;
16095 case AMDGPU::AReg_64RegClassID:
16096 return AMDGPU::AReg_64_Align2RegClassID;
16097 case AMDGPU::AReg_96RegClassID:
16098 return AMDGPU::AReg_96_Align2RegClassID;
16099 case AMDGPU::AReg_128RegClassID:
16100 return AMDGPU::AReg_128_Align2RegClassID;
16101 case AMDGPU::AReg_160RegClassID:
16102 return AMDGPU::AReg_160_Align2RegClassID;
16103 case AMDGPU::AReg_192RegClassID:
16104 return AMDGPU::AReg_192_Align2RegClassID;
16105 case AMDGPU::AReg_256RegClassID:
16106 return AMDGPU::AReg_256_Align2RegClassID;
16107 case AMDGPU::AReg_512RegClassID:
16108 return AMDGPU::AReg_512_Align2RegClassID;
16109 case AMDGPU::AReg_1024RegClassID:
16110 return AMDGPU::AReg_1024_Align2RegClassID;
16111 default:
16112 return -1;
16113 }
16114}
16115
16116// Figure out which registers should be reserved for stack access. Only after
16117// the function is legalized do we know all of the non-spill stack objects or if
16118// calls are present.
16119 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16120   MachineRegisterInfo &MRI = MF.getRegInfo();
16121   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16122   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16123 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16124 const SIInstrInfo *TII = ST.getInstrInfo();
16125
16126 if (Info->isEntryFunction()) {
16127 // Callable functions have fixed registers used for stack access.
16128     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16129   }
16130
16131 // TODO: Move this logic to getReservedRegs()
16132 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16133 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16134 Register SReg = ST.isWave32()
16135 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16136 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16137 &AMDGPU::SGPR_64RegClass);
16138 Info->setSGPRForEXECCopy(SReg);
16139
16140 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16141 Info->getStackPtrOffsetReg()));
16142 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16143 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16144
16145 // We need to worry about replacing the default register with itself in case
16146 // of MIR testcases missing the MFI.
16147 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16148 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16149
16150 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16151 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16152
16153 Info->limitOccupancy(MF);
16154
16155 if (ST.isWave32() && !MF.empty()) {
16156 for (auto &MBB : MF) {
16157 for (auto &MI : MBB) {
16158 TII->fixImplicitOperands(MI);
16159 }
16160 }
16161 }
16162
16163 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16164 // classes if required. Ideally the register class constraints would differ
16165 // per-subtarget, but there's no easy way to achieve that right now. This is
16166 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16167 // from using them as the register class for legal types.
16168 if (ST.needsAlignedVGPRs()) {
16169 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16170 const Register Reg = Register::index2VirtReg(I);
16171 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16172 if (!RC)
16173 continue;
16174 int NewClassID = getAlignedAGPRClassID(RC->getID());
16175 if (NewClassID != -1)
16176 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16177 }
16178 }
16179
16180   TargetLoweringBase::finalizeLowering(MF);
16181 }
16182
16184 KnownBits &Known,
16185 const APInt &DemandedElts,
16186 const SelectionDAG &DAG,
16187 unsigned Depth) const {
16188 Known.resetAll();
16189 unsigned Opc = Op.getOpcode();
16190 switch (Opc) {
16191   case ISD::INTRINSIC_WO_CHAIN: {
16192     unsigned IID = Op.getConstantOperandVal(0);
16193 switch (IID) {
16194 case Intrinsic::amdgcn_mbcnt_lo:
16195 case Intrinsic::amdgcn_mbcnt_hi: {
16196 const GCNSubtarget &ST =
16197           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16198       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16199 // most 31 + src1.
16200 Known.Zero.setBitsFrom(
16201 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16202 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16203 Known = KnownBits::add(Known, Known2);
16204 return;
16205 }
16206 }
16207 break;
16208 }
16209 }
16210   return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16211       Op, Known, DemandedElts, DAG, Depth);
16212}
16213
16214 void SITargetLowering::computeKnownBitsForFrameIndex(
16215     const int FI, KnownBits &Known, const MachineFunction &MF) const {
16216   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16217
16218 // Set the high bits to zero based on the maximum allowed scratch size per
16219 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16220 // calculation won't overflow, so assume the sign bit is never set.
16221 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16222}
16223
16224 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16225                                    KnownBits &Known, unsigned Dim) {
16226 unsigned MaxValue =
16227 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16228 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16229}
16230
16231 void SITargetLowering::computeKnownBitsForTargetInstr(
16232     GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16233 const MachineRegisterInfo &MRI, unsigned Depth) const {
16234 const MachineInstr *MI = MRI.getVRegDef(R);
16235 switch (MI->getOpcode()) {
16236 case AMDGPU::G_INTRINSIC:
16237 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16238 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16239 switch (IID) {
16240 case Intrinsic::amdgcn_workitem_id_x:
16241 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16242 break;
16243 case Intrinsic::amdgcn_workitem_id_y:
16244 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16245 break;
16246 case Intrinsic::amdgcn_workitem_id_z:
16247 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16248 break;
16249 case Intrinsic::amdgcn_mbcnt_lo:
16250 case Intrinsic::amdgcn_mbcnt_hi: {
16251 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16252 // most 31 + src1.
16253 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16254 ? getSubtarget()->getWavefrontSizeLog2()
16255 : 5);
16256 KnownBits Known2;
16257 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16258 Depth + 1);
16259 Known = KnownBits::add(Known, Known2);
16260 break;
16261 }
16262 case Intrinsic::amdgcn_groupstaticsize: {
16263 // We can report everything over the maximum size as 0. We can't report
16264 // based on the actual size because we don't know if it's accurate or not
16265 // at any given point.
16266 Known.Zero.setHighBits(
16267 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16268 break;
16269 }
16270 }
16271 break;
16272 }
16273 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16274 Known.Zero.setHighBits(24);
16275 break;
16276 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16277 Known.Zero.setHighBits(16);
16278 break;
16279 case AMDGPU::G_AMDGPU_SMED3:
16280 case AMDGPU::G_AMDGPU_UMED3: {
16281 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16282
16283 KnownBits Known2;
16284 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16285 if (Known2.isUnknown())
16286 break;
16287
16288 KnownBits Known1;
16289 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16290 if (Known1.isUnknown())
16291 break;
16292
16293 KnownBits Known0;
16294 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16295 if (Known0.isUnknown())
16296 break;
16297
16298 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16299 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16300 Known.One = Known0.One & Known1.One & Known2.One;
16301 break;
16302 }
16303 }
16304}
16305
16306 Align SITargetLowering::computeKnownAlignForTargetInstr(
16307     GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16308     unsigned Depth) const {
16309 const MachineInstr *MI = MRI.getVRegDef(R);
16310 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16311 // FIXME: Can this move to generic code? What about the case where the call
16312 // site specifies a lower alignment?
16313 Intrinsic::ID IID = GI->getIntrinsicID();
16314     LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16315     AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16316 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16317 return *RetAlign;
16318 }
16319 return Align(1);
16320}
16321
16322 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16323   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16324   const Align CacheLineAlign = Align(64);
16325
16326   // Pre-GFX10 targets did not benefit from loop alignment.
16327 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16328 getSubtarget()->hasInstFwdPrefetchBug())
16329 return PrefAlign;
16330
16331 // On GFX10 I$ is 4 x 64 bytes cache lines.
16332 // By default prefetcher keeps one cache line behind and reads two ahead.
16333 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16334 // behind and one ahead.
16335   // Therefore we can benefit from aligning loop headers if the loop fits in
16336   // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
16337   // cache lines and does not need an alignment.
16338   // Otherwise, if the loop is at most 128 bytes we do not need to modify the
16339   // prefetch; if it is at most 192 bytes we need two lines behind.
16340
16341   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16342   const MachineBasicBlock *Header = ML->getHeader();
16343 if (Header->getAlignment() != PrefAlign)
16344 return Header->getAlignment(); // Already processed.
16345
16346 unsigned LoopSize = 0;
16347 for (const MachineBasicBlock *MBB : ML->blocks()) {
16348     // If an inner loop block is aligned, assume on average half of the
16349     // alignment size will be added as nops.
16350 if (MBB != Header)
16351 LoopSize += MBB->getAlignment().value() / 2;
16352
16353 for (const MachineInstr &MI : *MBB) {
16354 LoopSize += TII->getInstSizeInBytes(MI);
16355 if (LoopSize > 192)
16356 return PrefAlign;
16357 }
16358 }
16359
16360 if (LoopSize <= 64)
16361 return PrefAlign;
16362
16363 if (LoopSize <= 128)
16364 return CacheLineAlign;
16365
16366 // If any of parent loops is surrounded by prefetch instructions do not
16367 // insert new for inner loop, which would reset parent's settings.
16368 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16369 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16370 auto I = Exit->getFirstNonDebugInstr();
16371 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16372 return CacheLineAlign;
16373 }
16374 }
16375
16376 MachineBasicBlock *Pre = ML->getLoopPreheader();
16377 MachineBasicBlock *Exit = ML->getExitBlock();
16378
16379 if (Pre && Exit) {
16380 auto PreTerm = Pre->getFirstTerminator();
16381 if (PreTerm == Pre->begin() ||
16382 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16383 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16384 .addImm(1); // prefetch 2 lines behind PC
16385
16386 auto ExitHead = Exit->getFirstNonDebugInstr();
16387 if (ExitHead == Exit->end() ||
16388 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16389 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16390 .addImm(2); // prefetch 1 line behind PC
16391 }
16392
16393 return CacheLineAlign;
16394}
16395
16396 LLVM_ATTRIBUTE_UNUSED
16397 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16398 assert(N->getOpcode() == ISD::CopyFromReg);
16399 do {
16400 // Follow the chain until we find an INLINEASM node.
16401 N = N->getOperand(0).getNode();
16402 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16403 return true;
16404 } while (N->getOpcode() == ISD::CopyFromReg);
16405 return false;
16406}
16407
16408 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16409                                                   FunctionLoweringInfo *FLI,
16410                                                   UniformityInfo *UA) const {
16411 switch (N->getOpcode()) {
16412 case ISD::CopyFromReg: {
16413 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16414 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16415 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16416 Register Reg = R->getReg();
16417
16418 // FIXME: Why does this need to consider isLiveIn?
16419 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16420 return !TRI->isSGPRReg(MRI, Reg);
16421
16422 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16423 return UA->isDivergent(V);
16424
16425 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16426 return !TRI->isSGPRReg(MRI, Reg);
16427 }
16428 case ISD::LOAD: {
16429 const LoadSDNode *L = cast<LoadSDNode>(N);
16430 unsigned AS = L->getAddressSpace();
16431 // A flat load may access private memory.
16432     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16433   }
16434 case ISD::CALLSEQ_END:
16435 return true;
16436   case ISD::INTRINSIC_WO_CHAIN:
16437     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16438   case ISD::INTRINSIC_W_CHAIN:
16439     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16458 // Target-specific read-modify-write atomics are sources of divergence.
16459 return true;
16460 default:
16461 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16462 // Generic read-modify-write atomics are sources of divergence.
16463 return A->readMem() && A->writeMem();
16464 }
16465 return false;
16466 }
16467}
16468
16469 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16470                                                EVT VT) const {
16471 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16472 case MVT::f32:
16473     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16474   case MVT::f64:
16475   case MVT::f16:
16476     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16477   default:
16478 return false;
16479 }
16480}
16481
16482 bool SITargetLowering::denormalsEnabledForType(
16483     LLT Ty, const MachineFunction &MF) const {
16484 switch (Ty.getScalarSizeInBits()) {
16485 case 32:
16486 return !denormalModeIsFlushAllF32(MF);
16487 case 64:
16488 case 16:
16489 return !denormalModeIsFlushAllF64F16(MF);
16490 default:
16491 return false;
16492 }
16493}
16494
16495 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16496                                                     const SelectionDAG &DAG,
16497 bool SNaN,
16498 unsigned Depth) const {
16499 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16500 const MachineFunction &MF = DAG.getMachineFunction();
16501     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16502
16503 if (Info->getMode().DX10Clamp)
16504 return true; // Clamped to 0.
16505 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16506 }
16507
16508   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16509                                                             Depth);
16510}
16511
16512// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16513// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16514 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16515   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16516 return true;
16517
16518   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16519   auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16520 if (DenormMode == DenormalMode::getPreserveSign())
16521 return true;
16522
16523 // TODO: Remove this.
16524 return RMW->getFunction()
16525 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16526 .getValueAsBool();
16527}
16528
16529 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16530   LLVMContext &Ctx = RMW->getContext();
16531 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16532 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16533
16534 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16535 << "Hardware instruction generated for atomic "
16536 << RMW->getOperationName(RMW->getOperation())
16537 << " operation at memory scope " << MemScope;
16538}
16539
16540static bool isV2F16OrV2BF16(Type *Ty) {
16541 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16542 Type *EltTy = VT->getElementType();
16543 return VT->getNumElements() == 2 &&
16544 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16545 }
16546
16547 return false;
16548}
16549
16550static bool isV2F16(Type *Ty) {
16551 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16552 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16553}
16554
16555static bool isV2BF16(Type *Ty) {
16556 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16557 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16558}
16559
16560/// \return true if atomicrmw integer ops work for the type.
16561static bool isAtomicRMWLegalIntTy(Type *Ty) {
16562 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16563 unsigned BW = IT->getBitWidth();
16564 return BW == 32 || BW == 64;
16565 }
16566
16567 return false;
16568}
16569
16570/// \return true if this atomicrmw xchg type can be selected.
16571static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16572 Type *Ty = RMW->getType();
16573 if (isAtomicRMWLegalIntTy(Ty))
16574 return true;
16575
16576 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16577 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16578 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16579 return BW == 32 || BW == 64;
16580 }
16581
16582 if (Ty->isFloatTy() || Ty->isDoubleTy())
16583 return true;
16584
16585 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16586 return VT->getNumElements() == 2 &&
16587 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16588 }
16589
16590 return false;
16591}
16592
16593/// \returns true if it's valid to emit a native instruction for \p RMW, based
16594/// on the properties of the target memory.
16595static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16596 const AtomicRMWInst *RMW,
16597 bool HasSystemScope) {
16598 // The remote/fine-grained access logic is different from the integer
16599 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16600 // fine-grained access does not work, even for a device local allocation.
16601 //
16602 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16603 // allocations work.
16604 if (HasSystemScope) {
16605     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16606         RMW->hasMetadata("amdgpu.no.remote.memory"))
16607 return true;
16608   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16609     return true;
16610
16611 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16612}
16613
16614/// \return Action to perform on AtomicRMWInsts for integer operations.
16615 static TargetLowering::AtomicExpansionKind
16616 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16617   return isAtomicRMWLegalIntTy(RMW->getType())
16618              ? TargetLowering::AtomicExpansionKind::None
16619              : TargetLowering::AtomicExpansionKind::CmpXChg;
16620 }
16621
16622/// Return if a flat address space atomicrmw can access private memory.
16623 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16624   const MDNode *NoaliasAddrSpaceMD =
16625 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16626 if (!NoaliasAddrSpaceMD)
16627 return true;
16628
16629 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16630 ++I) {
16631 auto *Low = mdconst::extract<ConstantInt>(
16632 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16633 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16634 auto *High = mdconst::extract<ConstantInt>(
16635 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16636 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16637 }
16638 }
16639
16640 return true;
16641}
16642
16643 TargetLowering::AtomicExpansionKind
16644 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16645   unsigned AS = RMW->getPointerAddressSpace();
16646 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16647     return AtomicExpansionKind::NotAtomic;
16648
16649 // 64-bit flat atomics that dynamically reside in private memory will silently
16650 // be dropped.
16651 //
16652 // Note that we will emit a new copy of the original atomic in the expansion,
16653 // which will be incrementally relegalized.
16654 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16655 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16656 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16657       flatInstrMayAccessPrivate(RMW))
16658     return AtomicExpansionKind::Expand;
16659
16660 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16661     OptimizationRemarkEmitter ORE(RMW->getFunction());
16662     ORE.emit([=]() {
16663 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16664 });
16665 return Kind;
16666 };
16667
16668 auto SSID = RMW->getSyncScopeID();
16669 bool HasSystemScope =
16670 SSID == SyncScope::System ||
16671 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16672
16673 auto Op = RMW->getOperation();
16674 switch (Op) {
16675 case AtomicRMWInst::Xchg: {
16676 // PCIe supports add and xchg for system atomics.
16677 return isAtomicRMWLegalXChgTy(RMW)
16678 ? TargetLowering::AtomicExpansionKind::None
16679 : TargetLowering::AtomicExpansionKind::CmpXChg;
16680 }
16681 case AtomicRMWInst::Add:
16682 case AtomicRMWInst::And:
16683 case AtomicRMWInst::UIncWrap:
16684 case AtomicRMWInst::UDecWrap:
16685 return atomicSupportedIfLegalIntType(RMW);
16686 case AtomicRMWInst::Sub:
16687 case AtomicRMWInst::Or:
16688 case AtomicRMWInst::Xor: {
16689 // Atomic sub/or/xor do not work over PCI express, but atomic add
16690 // does. InstCombine transforms these with 0 to or, so undo that.
16691 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16692 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16693 ConstVal && ConstVal->isNullValue())
16694 return AtomicExpansionKind::Expand;
16695 }
16696
16697 return atomicSupportedIfLegalIntType(RMW);
16698 }
16699 case AtomicRMWInst::FAdd: {
16700 Type *Ty = RMW->getType();
16701
16702 // TODO: Handle REGION_ADDRESS
16703 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16704 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16705 // is fixed to round-to-nearest-even.
16706 //
16707 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16708 // round-to-nearest-even.
16709 //
16710 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16711 // suggests it is OK if the floating-point mode may not match the calling
16712 // thread.
16713 if (Ty->isFloatTy()) {
16714 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16715 : AtomicExpansionKind::CmpXChg;
16716 }
16717
16718 if (Ty->isDoubleTy()) {
16719 // Ignores denormal mode, but we don't consider flushing mandatory.
16720 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16721 : AtomicExpansionKind::CmpXChg;
16722 }
16723
16724 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16725 return AtomicExpansionKind::None;
16726
16727 return AtomicExpansionKind::CmpXChg;
16728 }
16729
16730 // LDS atomics respect the denormal mode from the mode register.
16731 //
16732 // Traditionally f32 global/buffer memory atomics would unconditionally
16733 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16734 // flush.
16735 //
16736 // On targets with flat atomic fadd, denormals would flush depending on
16737 // whether the target address resides in LDS or global memory. We consider
16738 // this flat-maybe-flush as will-flush.
16739 if (Ty->isFloatTy() &&
16740 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16741 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16742 return AtomicExpansionKind::CmpXChg;
16743
16744 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16745 // safe. The message phrasing also should be better.
16746 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16747 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16748 // gfx940, gfx12
16749 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16750 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16751 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16752 // gfx90a, gfx940, gfx12
16753 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16754 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16755
16756 // gfx940, gfx12
16757 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16758 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16759 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16760 // gfx90a, gfx940, gfx12
16761 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16762 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16763
16764 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16765 // buffer. gfx12 does have the buffer version.
16766 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16767 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16768 }
16769
16770 // global and flat atomic fadd f64: gfx90a, gfx940.
16771 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16772 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16773
16774 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16775 if (Ty->isFloatTy()) {
16776 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16777 // gfx11+.
16778 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16779 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16780 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16781 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16782 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16783 } else {
16784 // gfx908
16785 if (RMW->use_empty() &&
16786 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16787 isV2F16(Ty))
16788 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16789 }
16790 }
16791
16792 // flat atomic fadd f32: gfx940, gfx11+.
16793 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16794 if (Subtarget->hasFlatAtomicFaddF32Inst())
16795 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16796
16797 // If it is in flat address space, and the type is float, we will try to
16798 // expand it, if the target supports global and lds atomic fadd. The
16799 // reason we need that is, in the expansion, we emit the check of
16800 // address space. If it is in global address space, we emit the global
16801 // atomic fadd; if it is in shared address space, we emit the LDS atomic
16802 // fadd.
16803 if (Subtarget->hasLDSFPAtomicAddF32()) {
16804 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16805 return AtomicExpansionKind::Expand;
16806 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16807 return AtomicExpansionKind::Expand;
16808 }
16809 }
16810 }
16811
16812 return AtomicExpansionKind::CmpXChg;
16813 }
16814 case AtomicRMWInst::FMin:
16815 case AtomicRMWInst::FMax: {
16816 Type *Ty = RMW->getType();
16817
16818 // LDS float and double fmin/fmax were always supported.
16819 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16820 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16821 : AtomicExpansionKind::CmpXChg;
16822 }
16823
16824 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16825 // For flat and global cases:
16826 // float, double in gfx7. Manual claims denormal support.
16827 // Removed in gfx8.
16828 // float, double restored in gfx10.
16829 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16830 //
16831 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16832 // no f32.
16833 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16834 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16835 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16836 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16837 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16838 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16839 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16840 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16841 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16842 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16843 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16844 }
16845 }
16846
16847 return AtomicExpansionKind::CmpXChg;
16848 }
16849 case AtomicRMWInst::Min:
16850 case AtomicRMWInst::Max:
16851 case AtomicRMWInst::UMin:
16852 case AtomicRMWInst::UMax: {
16853 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16854 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16855 // Always expand system scope min/max atomics.
16856 if (HasSystemScope)
16857 return AtomicExpansionKind::CmpXChg;
16858 }
16859
16860 return atomicSupportedIfLegalIntType(RMW);
16861 }
16862 case AtomicRMWInst::Nand:
16863 case AtomicRMWInst::FSub:
16864 default:
16865 return AtomicExpansionKind::CmpXChg;
16866 }
16867
16868 llvm_unreachable("covered atomicrmw op switch");
16869}
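// Illustrative sketch (not part of the upstream source): two consequences of
// the switch above, assuming a global-memory pointer:
//   atomicrmw or ptr addrspace(1) %p, i32 0 seq_cst
//     ; at system scope, a zero-value or/sub/xor is expanded so it can be
//     ; rewritten as an add, which PCIe does support (see emitExpandAtomicRMW
//     ; below).
//   atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") seq_cst,
//       !amdgpu.no.fine.grained.memory !0
//     ; may select a hardware global atomic fadd on subtargets that have it,
//     ; emitting an "unsafe" remark via ReportUnsafeHWInst; otherwise it falls
//     ; back to a cmpxchg expansion.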
16870
16871 TargetLowering::AtomicExpansionKind
16872 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16873 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16874 ? AtomicExpansionKind::NotAtomic
16875 : AtomicExpansionKind::None;
16876 }
16877
16878 TargetLowering::AtomicExpansionKind
16879 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16880 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16881 ? AtomicExpansionKind::NotAtomic
16882 : AtomicExpansionKind::None;
16883 }
16884
16885 TargetLowering::AtomicExpansionKind
16886 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16887 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16888 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16889 return AtomicExpansionKind::NotAtomic;
16890
16891 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16892 return AtomicExpansionKind::None;
16893
16894 const DataLayout &DL = CmpX->getDataLayout();
16895
16896 Type *ValTy = CmpX->getNewValOperand()->getType();
16897
16898 // If a 64-bit flat atomic may alias private, we need to avoid using the
16899 // atomic in the private case.
16900 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16901 : AtomicExpansionKind::None;
16902}
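// Illustrative sketch (not part of the upstream source): per the logic above,
// a 64-bit cmpxchg through a flat pointer that may alias private memory is
// expanded with an explicit address-space check, e.g.:
//   %r = cmpxchg ptr %p, i64 %old, i64 %new seq_cst seq_cst
// while the same cmpxchg carrying !noalias.addrspace !{i32 5, i32 6} (known
// not private) or using a ptr addrspace(1) operand is left as a native atomic.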
16903
16904const TargetRegisterClass *
16905 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16906 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16907 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16908 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16909 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16910 : &AMDGPU::SReg_32RegClass;
16911 if (!TRI->isSGPRClass(RC) && !isDivergent)
16912 return TRI->getEquivalentSGPRClass(RC);
16913 if (TRI->isSGPRClass(RC) && isDivergent)
16914 return TRI->getEquivalentVGPRClass(RC);
16915
16916 return RC;
16917}
16918
16919// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16920// uniform values (as produced by the mask results of control flow intrinsics)
16921// used outside of divergent blocks. The phi users need to also be treated as
16922// always uniform.
16923//
16924// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16925static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16926 unsigned WaveSize) {
16927 // FIXME: We assume we never cast the mask results of a control flow
16928 // intrinsic.
16929 // Early exit if the type won't be consistent as a compile time hack.
16930 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16931 if (!IT || IT->getBitWidth() != WaveSize)
16932 return false;
16933
16934 if (!isa<Instruction>(V))
16935 return false;
16936 if (!Visited.insert(V).second)
16937 return false;
16938 bool Result = false;
16939 for (const auto *U : V->users()) {
16940 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16941 if (V == U->getOperand(1)) {
16942 switch (Intrinsic->getIntrinsicID()) {
16943 default:
16944 Result = false;
16945 break;
16946 case Intrinsic::amdgcn_if_break:
16947 case Intrinsic::amdgcn_if:
16948 case Intrinsic::amdgcn_else:
16949 Result = true;
16950 break;
16951 }
16952 }
16953 if (V == U->getOperand(0)) {
16954 switch (Intrinsic->getIntrinsicID()) {
16955 default:
16956 Result = false;
16957 break;
16958 case Intrinsic::amdgcn_end_cf:
16959 case Intrinsic::amdgcn_loop:
16960 Result = true;
16961 break;
16962 }
16963 }
16964 } else {
16965 Result = hasCFUser(U, Visited, WaveSize);
16966 }
16967 if (Result)
16968 break;
16969 }
16970 return Result;
16971}
16972
16973 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16974 const Value *V) const {
16975 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16976 if (CI->isInlineAsm()) {
16977 // FIXME: This cannot give a correct answer. This should only trigger in
16978 // the case where inline asm returns mixed SGPR and VGPR results, used
16979 // outside the defining block. We don't have a specific result to
16980 // consider, so this assumes if any value is SGPR, the overall register
16981 // also needs to be SGPR.
16982 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16983 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16984 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16985 for (auto &TC : TargetConstraints) {
16986 if (TC.Type == InlineAsm::isOutput) {
16987 ComputeConstraintToUse(TC, SDValue());
16988 const TargetRegisterClass *RC =
16989 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16990 TC.ConstraintVT)
16991 .second;
16992 if (RC && SIRI->isSGPRClass(RC))
16993 return true;
16994 }
16995 }
16996 }
16997 }
16998 SmallPtrSet<const Value *, 16> Visited;
16999 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17000}
17001
17002 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17003 for (SDUse &Use : N->uses()) {
17004 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17005 if (getBasePtrIndex(M) == Use.getOperandNo())
17006 return true;
17007 }
17008 }
17009 return false;
17010}
17011
17012 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17013 SDValue N1) const {
17014 if (!N0.hasOneUse())
17015 return false;
17016 // Take care of the opportunity to keep N0 uniform
17017 if (N0->isDivergent() || !N1->isDivergent())
17018 return true;
17019 // Check if we have a good chance to form the memory access pattern with the
17020 // base and offset
17021 return (DAG.isBaseWithConstantOffset(N0) &&
17022 hasMemSDNodeUser(*N0->user_begin()));
17023}
17024
17025 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17026 Register N0, Register N1) const {
17027 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17028}
17029
17030 MachineMemOperand::Flags
17031 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17032 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17033 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17034 if (I.getMetadata("amdgpu.noclobber"))
17035 Flags |= MONoClobber;
17036 if (I.getMetadata("amdgpu.last.use"))
17037 Flags |= MOLastUse;
17038 return Flags;
17039}
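// Illustrative sketch (not part of the upstream source): the metadata checked
// above is placed on loads by AMDGPU IR passes rather than written by hand;
// e.g. a load proven not to be clobbered ends up as:
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
//   !0 = !{}
// and getTargetMMOFlags() then tags the resulting MachineMemOperand with
// MONoClobber (and similarly MOLastUse for !amdgpu.last.use).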
17040
17041 bool SITargetLowering::checkForPhysRegDependency(
17042 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17043 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
17044 if (User->getOpcode() != ISD::CopyToReg)
17045 return false;
17046 if (!Def->isMachineOpcode())
17047 return false;
17048 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17049 if (!MDef)
17050 return false;
17051
17052 unsigned ResNo = User->getOperand(Op).getResNo();
17053 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17054 return false;
17055 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17056 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17057 PhysReg = AMDGPU::SCC;
17058 const TargetRegisterClass *RC =
17059 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17060 Cost = RC->getCopyCost();
17061 return true;
17062 }
17063 return false;
17064}
17065
17066/// Check if it is profitable to hoist instruction in then/else to if.
17067 bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17068 if (!I->hasOneUse())
17069 return true;
17070
17071 Instruction *User = I->user_back();
17072 // TODO: Add more patterns that are not profitable to hoist and
17073 // handle modifiers such as fabs and fneg
17074 switch (I->getOpcode()) {
17075 case Instruction::FMul: {
17076 if (User->getOpcode() != Instruction::FSub &&
17077 User->getOpcode() != Instruction::FAdd)
17078 return true;
17079
17080 const TargetOptions &Options = getTargetMachine().Options;
17081
17082 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17083 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17084 !Options.UnsafeFPMath) ||
17085 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17086 }
17087 default:
17088 return true;
17089 }
17090 return true;
17091}
17092
17093 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17094 Instruction *AI) const {
17095 // Given: atomicrmw fadd ptr %addr, float %val ordering
17096 //
17097 // With this expansion we produce the following code:
17098 // [...]
17099 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17100 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17101 //
17102 // atomicrmw.shared:
17103 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17104 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17105 // float %val ordering
17106 // br label %atomicrmw.phi
17107 //
17108 // atomicrmw.check.private:
17109 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17110 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17111 //
17112 // atomicrmw.private:
17113 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17114 // %loaded.private = load float, ptr addrspace(5) %cast.private
17115 // %val.new = fadd float %loaded.private, %val
17116 // store float %val.new, ptr addrspace(5) %cast.private
17117 // br label %atomicrmw.phi
17118 //
17119 // atomicrmw.global:
17120 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17121 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17122 // float %val ordering
17123 // br label %atomicrmw.phi
17124 //
17125 // atomicrmw.phi:
17126 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17127 // [ %loaded.private, %atomicrmw.private ],
17128 // [ %loaded.global, %atomicrmw.global ]
17129 // br label %atomicrmw.end
17130 //
17131 // atomicrmw.end:
17132 // [...]
17133 //
17134 //
17135 // For 64-bit atomics which may reside in private memory, we perform a simpler
17136 // version that only inserts the private check, and uses the flat operation.
17137
17138 IRBuilder<> Builder(AI);
17139 LLVMContext &Ctx = Builder.getContext();
17140
17141 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17142 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17143 : AtomicCmpXchgInst::getPointerOperandIndex();
17144 Value *Addr = AI->getOperand(PtrOpIdx);
17145
17146 /// TODO: Only need to check private, then emit flat-known-not private (no
17147 /// need for shared block, or cast to global).
17148 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17149
17150 Align Alignment;
17151 if (RMW)
17152 Alignment = RMW->getAlign();
17153 else if (CX)
17154 Alignment = CX->getAlign();
17155 else
17156 llvm_unreachable("unhandled atomic operation");
17157
17158 // FullFlatEmulation is true if we need to issue the private, shared, and
17159 // global cases.
17160 //
17161 // If this is false, we are only dealing with the flat-targeting-private case,
17162 // where we only insert a check for private and still use the flat instruction
17163 // for global and shared.
17164
17165 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17166 Subtarget->hasAtomicFaddInsts() &&
17167 RMW->getType()->isFloatTy();
17168
17169 // If the return value isn't used, do not introduce a false use in the phi.
17170 bool ReturnValueIsUsed = !AI->use_empty();
17171
17172 BasicBlock *BB = Builder.GetInsertBlock();
17173 Function *F = BB->getParent();
17174 BasicBlock *ExitBB =
17175 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17176 BasicBlock *SharedBB = nullptr;
17177
17178 BasicBlock *CheckPrivateBB = BB;
17179 if (FullFlatEmulation) {
17180 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17181 CheckPrivateBB =
17182 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17183 }
17184
17185 BasicBlock *PrivateBB =
17186 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17187 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17188 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17189
17190 std::prev(BB->end())->eraseFromParent();
17191 Builder.SetInsertPoint(BB);
17192
17193 Value *LoadedShared = nullptr;
17194 if (FullFlatEmulation) {
17195 CallInst *IsShared = Builder.CreateIntrinsic(
17196 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17197 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17198 Builder.SetInsertPoint(SharedBB);
17199 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17200 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17201
17202 Instruction *Clone = AI->clone();
17203 Clone->insertInto(SharedBB, SharedBB->end());
17204 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17205 LoadedShared = Clone;
17206
17207 Builder.CreateBr(PhiBB);
17208 Builder.SetInsertPoint(CheckPrivateBB);
17209 }
17210
17211 CallInst *IsPrivate = Builder.CreateIntrinsic(
17212 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17213 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17214
17215 Builder.SetInsertPoint(PrivateBB);
17216
17217 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17218 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17219
17220 Value *LoadedPrivate;
17221 if (RMW) {
17222 LoadedPrivate = Builder.CreateAlignedLoad(
17223 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17224
17225 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17226 LoadedPrivate, RMW->getValOperand());
17227
17228 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17229 } else {
17230 auto [ResultLoad, Equal] =
17231 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17232 CX->getNewValOperand(), CX->getAlign());
17233
17234 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17235 ResultLoad, 0);
17236 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17237 }
17238
17239 Builder.CreateBr(PhiBB);
17240
17241 Builder.SetInsertPoint(GlobalBB);
17242
17243 // Continue using a flat instruction if we only emitted the check for private.
17244 Instruction *LoadedGlobal = AI;
17245 if (FullFlatEmulation) {
17246 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17247 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17248 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17249 }
17250
17251 AI->removeFromParent();
17252 AI->insertInto(GlobalBB, GlobalBB->end());
17253
17254 // The new atomicrmw may go through another round of legalization later.
17255 if (!FullFlatEmulation) {
17256 // We inserted the runtime check already, make sure we do not try to
17257 // re-expand this.
17258 // TODO: Should union with any existing metadata.
17259 MDBuilder MDB(F->getContext());
17260 MDNode *RangeNotPrivate =
17261 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17262 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17263 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17264 RangeNotPrivate);
17265 }
17266
17267 Builder.CreateBr(PhiBB);
17268
17269 Builder.SetInsertPoint(PhiBB);
17270
17271 if (ReturnValueIsUsed) {
17272 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17273 AI->replaceAllUsesWith(Loaded);
17274 if (FullFlatEmulation)
17275 Loaded->addIncoming(LoadedShared, SharedBB);
17276 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17277 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17278 Loaded->takeName(AI);
17279 }
17280
17281 Builder.CreateBr(ExitBB);
17282}
17283
17284 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17285 AtomicRMWInst::BinOp Op = AI->getOperation();
17286
17287 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17288 Op == AtomicRMWInst::Xor) {
17289 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17290 ConstVal && ConstVal->isNullValue()) {
17291 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17292 AI->setOperation(AtomicRMWInst::Add);
17293
17294 // We may still need the private-alias-flat handling below.
17295
17296 // TODO: Skip this for cases where we cannot access remote memory.
17297 }
17298 }
17299
17300 // The non-flat expansions should only perform the de-canonicalization of
17301 // identity values.
17302 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17303 return;
17304
17305 emitExpandAtomicAddrSpacePredicate(AI);
17306}
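// Illustrative sketch (not part of the upstream source): the de-canonicalization
// above turns
//   %old = atomicrmw or ptr %p, i32 0 seq_cst
// back into
//   %old = atomicrmw add ptr %p, i32 0 seq_cst
// since PCIe-routed system atomics support add but not or/sub/xor, and the
// zero operand makes the two forms equivalent.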
17307
17308 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17309 emitExpandAtomicAddrSpacePredicate(CI);
17310}
17311
17312LoadInst *
17313 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17314 IRBuilder<> Builder(AI);
17315 auto Order = AI->getOrdering();
17316
17317 // The optimization removes the store aspect of the atomicrmw. Therefore, the
17318 // cache must be flushed if the atomic ordering had release semantics. The
17319 // flush does not have to be a fence; a release fence just happens to do it.
17320 // Avoid replacing an atomicrmw that has release semantics.
17321 if (isReleaseOrStronger(Order))
17322 return nullptr;
17323
17324 LoadInst *LI = Builder.CreateAlignedLoad(
17325 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17326 LI->setAtomic(Order, AI->getSyncScopeID());
17327 LI->copyMetadata(*AI);
17328 LI->takeName(AI);
17329 AI->replaceAllUsesWith(LI);
17330 AI->eraseFromParent();
17331 return LI;
17332}
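// Illustrative sketch (not part of the upstream source): for an idempotent RMW
// whose ordering is not release or stronger, e.g.:
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 acquire
// the hook above rewrites it into an atomic load that preserves the ordering,
// sync scope, and metadata:
//   %old = load atomic i32, ptr addrspace(1) %p acquire, align 4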
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1122
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
bool isNegative() const
Definition: APFloat.h:1445
APInt bitcastToAPInt() const
Definition: APFloat.h:1351
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
bool isInfinity() const
Definition: APFloat.h:1442
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1451
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
bool hasPkMovB32() const
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
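Most of the subtarget predicates above exist to gate lowering decisions. A minimal sketch of how such queries are typically combined, assuming only a GCNSubtarget reference; the helper name and the particular predicate combination are illustrative, not logic from this file:

#include "GCNSubtarget.h"

using namespace llvm;

// Illustrative helper (not part of SIISelLowering): decide whether an f32
// flat atomic fadd could be selected directly instead of being expanded.
// The exact predicate combination is an assumption made for the example.
static bool canUseFlatAtomicFaddF32(const GCNSubtarget &ST) {
  return ST.hasFlatAtomicFaddF32Inst() && ST.hasFlatInstOffsets();
}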
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:512
unsigned getAddressSpace() const
Definition: GlobalValue.h:206
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
Type * getValueType() const
Definition: GlobalValue.h:297
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
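A minimal sketch of the IRBuilder calls listed above, in the spirit of the IR-level expansions this lowering emits; the block names and the helper itself are hypothetical:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: turn a select-like value into explicit control flow.
static Value *emitSelectViaBranch(IRBuilder<> &Builder, Value *Cond,
                                  Value *TrueV, Value *FalseV) {
  BasicBlock *CurBB = Builder.GetInsertBlock();
  Function *F = CurBB->getParent();
  LLVMContext &Ctx = Builder.getContext();

  BasicBlock *TrueBB = BasicBlock::Create(Ctx, "sel.true", F);
  BasicBlock *FalseBB = BasicBlock::Create(Ctx, "sel.false", F);
  BasicBlock *JoinBB = BasicBlock::Create(Ctx, "sel.join", F);

  // Branch on the condition, then re-join and merge the two values in a PHI.
  Builder.CreateCondBr(Cond, TrueBB, FalseBB);

  Builder.SetInsertPoint(TrueBB);
  Builder.CreateBr(JoinBB);

  Builder.SetInsertPoint(FalseBB);
  Builder.CreateBr(JoinBB);

  Builder.SetInsertPoint(JoinBB);
  PHINode *Phi = Builder.CreatePHI(TrueV->getType(), /*NumReservedValues=*/2);
  Phi->addIncoming(TrueV, TrueBB);
  Phi->addIncoming(FalseV, FalseBB);
  return Phi;
}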
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
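A short illustration of the LLT constructors and queries above; the concrete bit widths and address space are arbitrary, and the header path assumes a recent tree layout:

#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

// Illustration only: build a 32-bit scalar and a 64-bit pointer, and derive a
// narrower scalar via changeElementSize (a scalar stays a scalar).
static void lltExamples() {
  LLT S32 = LLT::scalar(32);                                         // s32
  LLT FlatPtr = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64); // p0
  LLT S16 = S32.changeElementSize(16);                               // s16
  (void)FlatPtr;
  (void)S16;
}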
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
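Tying together the load-creation and metadata entries above, a small hedged sketch: build an aligned load and attach a !range node created with MDBuilder (the helper and the value range are arbitrary):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

// Hypothetical helper: load an i32 and annotate it as lying in [0, 1024).
static LoadInst *loadWithRange(IRBuilder<> &Builder, Value *Ptr) {
  LLVMContext &Ctx = Builder.getContext();
  LoadInst *LI =
      Builder.CreateAlignedLoad(Type::getInt32Ty(Ctx), Ptr, Align(4), "val");

  MDBuilder MDB(Ctx);
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);
  return LI;
}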
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector type has a power-of-two number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
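A brief illustration of the MVT queries above; v4i32 is chosen arbitrarily, and the header path assumes a recent tree layout:

#include "llvm/CodeGenTypes/MachineValueType.h"

using namespace llvm;

// Illustration only: inspect a v4i32 machine value type.
static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  MVT EltVT = V4I32.getScalarType();               // MVT::i32
  TypeSize Bits = V4I32.getSizeInBits();           // 128 bits
  TypeSize Bytes = V4I32.getStoreSize();           // 16 bytes
  (void)NumElts; (void)EltVT; (void)Bits; (void)Bytes;
}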
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
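A hedged sketch of how the frame-info entries above are commonly used during argument lowering: create a fixed stack object and hand a frame index back to the DAG. The size, the offset handling, and the use of MVT::i32 as the pointer type are assumptions for the example:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Hypothetical helper: a frame index for a 4-byte argument at a fixed offset.
static SDValue getFixedArgFrameIndex(SelectionDAG &DAG, int64_t Offset) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/Offset,
                                 /*IsImmutable=*/true);
  return DAG.getFrameIndex(FI, MVT::i32);
}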
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
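The MachineInstrBuilder helpers above are normally reached through BuildMI. A minimal sketch that takes the opcode descriptor and registers as parameters, so no particular target instruction is assumed:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Illustration only: emit "Dst = <MCID> Src, Imm" immediately before I.
static MachineInstr *emitRegImm(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, const MCInstrDesc &MCID,
                                Register Dst, Register Src, int64_t Imm) {
  return BuildMI(MBB, I, DL, MCID, Dst).addReg(Src).addImm(Imm);
}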
Representation of each machine instruction.
Definition: MachineInstr.h:70
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:586
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
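A sketch combining MachineFunction::getMachineMemOperand (listed above) with the flag values just described; the access shape, type, and alignment are purely illustrative:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

// Hypothetical helper: describe an invariant, dereferenceable 32-bit load so
// later passes can reason about the access.
static MachineMemOperand *
makeInvariantLoadMMO(MachineFunction &MF, const MachinePointerInfo &PtrInfo) {
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
}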
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
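A small example of the SDValue/SDNode accessors above: recognize an (add x, constant) pattern of the kind DAG combines frequently look for. The helper is illustrative, not a combine from this file:

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Returns true and fills Base/Imm if V is (add Base, constant).
static bool matchAddWithImm(SDValue V, SDValue &Base, uint64_t &Imm) {
  if (V.getOpcode() != ISD::ADD)
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    Base = V.getOperand(0);
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}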
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with other operations to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns true if Op is known never to be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist an instruction from a then/else block into the preceding if block.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
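Most of the overrides above share the same shape: inspect an IR instruction or DAG node and return a target decision. A hedged sketch of one such policy, written as a free function for a hypothetical target; this is not SI's actual atomic-expansion logic:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical policy: expand FP read-modify-write atomics through a cmpxchg
// loop and leave everything else to normal selection.
static TargetLowering::AtomicExpansionKind
classifyAtomicRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  default:
    return TargetLowering::AtomicExpansionKind::None;
  }
}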
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:983
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:713
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
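A compact sketch that uses several of the SelectionDAG builders above in the shape of a custom lowering helper; the clamp-to-zero operation and the assumption of an integer value type are only for illustration:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

// Illustrative lowering helper: max(Op, 0) built from generic nodes.
// Assumes Op has an integer type (getConstant would assert on FP types).
static SDValue lowerClampToZero(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsNeg = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Zero, Op);
}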
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
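StringRef and StringSwitch are the usual tools behind the constraint-string handling listed earlier (getConstraintType, getRegForInlineAsmConstraint). A small sketch; the constraint letters and the enum are examples, not the real mapping:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

enum class ExampleRegBank { SGPR, VGPR, AGPR, Unknown };

// Illustration only: map a constraint-like string to a register bank.
static ExampleRegBank classifyConstraint(StringRef Constraint) {
  if (Constraint.starts_with("{") && Constraint.ends_with("}"))
    Constraint = Constraint.drop_front().drop_back(); // strip the braces
  return StringSwitch<ExampleRegBank>(Constraint)
      .Case("s", ExampleRegBank::SGPR)
      .Case("v", ExampleRegBank::VGPR)
      .Case("a", ExampleRegBank::AGPR)
      .Default(ExampleRegBank::Unknown);
}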
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
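The setup-style methods above are normally called from a TargetLowering constructor. A hedged sketch of that pattern in a heavily trimmed, hypothetical subclass; the opcodes, types, and actions are examples, not this target's configuration:

#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

// Hypothetical subclass showing the usual constructor-time setup calls.
class ExampleTargetLowering : public TargetLowering {
public:
  explicit ExampleTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);   // expand to a sequence
    setOperationAction(ISD::SELECT, MVT::i64, Custom); // route to LowerOperation
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    setTargetDAGCombine({ISD::ADD, ISD::FADD});        // request combine callbacks
    setSchedulingPreference(Sched::RegPressure);
  }
};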
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same as the overflow-aware add/sub nodes above, but for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum; the same as FMINNUM_IEEE and FMAXNUM_IEEE except...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1643
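For illustration, a small standalone sketch of the two CondCode helpers above (assumed to be compiled against LLVM's headers and linked with its SelectionDAG library):

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    // Swapping the operands of (X < Y) yields (Y > X).
    ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(ISD::SETLT);
    outs() << (Swapped == ISD::SETGT) << "\n";            // 1
    outs() << ISD::isSignedIntSetCC(ISD::SETLT) << "\n";  // 1: signed compare
    outs() << ISD::isSignedIntSetCC(ISD::SETULT) << "\n"; // 0: unsigned compare
    return 0;
  }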
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1612
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:55
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
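A minimal usage sketch of getICmpCondCode (assumed to be built against LLVM's CodeGen library):

  #include "llvm/CodeGen/Analysis.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    // IR's signed less-than maps onto the DAG's SETLT condition code.
    ISD::CondCode CC = getICmpCondCode(ICmpInst::ICMP_SLT);
    outs() << (CC == ISD::SETLT) << "\n"; // 1
    return 0;
  }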
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:246
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:60
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:557
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:287
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
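The integer utilities listed above are header-only; a small standalone sketch, assuming LLVM's headers are on the include path:

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    outs() << divideCeil(10, 4) << "\n";  // 3
    outs() << alignDown(37, 16) << "\n";  // 32
    outs() << PowerOf2Ceil(17) << "\n";   // 32
    outs() << Log2_32(64) << "\n";        // 6
    outs() << isPowerOf2_32(96) << "\n";  // 0 (not a power of two)
    outs() << countr_zero(0x80u) << "\n"; // 7
    outs() << popcount(0xF0u) << "\n";    // 4
    outs() << bit_width(17u) << "\n";     // 5
    outs() << maxIntN(8) << "\n";         // 127

    // Hi_32/Lo_32 split a 64-bit value into its two 32-bit halves.
    uint64_t V = 0x0123456789abcdefULL;
    outs() << Hi_32(V) << " " << Lo_32(V) << "\n";
    return 0;
  }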
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:237
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
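The STLExtras range helpers above (any_of, find_if, is_contained, drop_begin, append_range) are also header-only; a minimal standalone sketch:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    SmallVector<int, 8> Vals = {2, 4, 6, 7};

    // Range-based wrappers avoid spelling out begin()/end() by hand.
    bool HasOdd = any_of(Vals, [](int V) { return V % 2 != 0; });
    bool HasSeven = is_contained(Vals, 7);
    auto FirstBig = find_if(Vals, [](int V) { return V > 5; });

    // drop_begin views the same container without its first element.
    int SumTail = 0;
    for (int V : drop_begin(Vals))
      SumTail += V;

    // append_range copies another range onto the end of a container.
    SmallVector<int, 8> Copy;
    append_range(Copy, Vals);

    outs() << HasOdd << " " << HasSeven << " " << *FirstBig << " " << SumTail
           << " " << Copy.size() << "\n"; // 1 1 6 17 4
    return 0;
  }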
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
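Align, alignTo and commonAlignment fit together as in this small standalone sketch:

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    Align A(16);                       // must be a non-zero power of two
    outs() << alignTo(100, A) << "\n"; // 112: next multiple of 16
    // Alignment known for (base + 8) when the base is 16-byte aligned.
    outs() << commonAlignment(A, 8).value() << "\n"; // 8
    return 0;
  }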
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
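A small standalone sketch of the KnownBits interface listed above, assuming LLVM's Support headers and library are available:

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    // Two fully known 32-bit operands give a fully known sum.
    KnownBits LHS = KnownBits::makeConstant(APInt(32, 12));
    KnownBits RHS = KnownBits::makeConstant(APInt(32, 40));
    KnownBits Sum = KnownBits::add(LHS, RHS);
    outs() << Sum.countMinLeadingZeros() << "\n"; // 26: 52 fits in 6 bits

    // A freshly constructed KnownBits has no bits known until analysis runs.
    KnownBits Unknown(32);
    outs() << Unknown.isUnknown() << "\n"; // 1
    Unknown.resetAll();                    // back to the all-unknown state
    return 0;
  }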
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals