SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67 }
68
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72 }
73
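// Linearly scans SGPR0 upward and returns the first SGPR that the calling
// convention state has not allocated yet; unreachable if every SGPR is taken.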
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
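// Illustrative effect of the promotions above: an operation such as
// (extract_vector_elt v2i64:$vec, 1) is handled as the equivalent operation on
// the bitcast v4i32 value, so only 32-bit element handling is needed later.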
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value,
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // On SI this is s_memtime; on VI it is s_memrealtime.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 } else {
859 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
860 if (Subtarget->hasMinimum3Maximum3F32())
862
863 if (Subtarget->hasMinimum3Maximum3PKF16())
865 }
866
868 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
869 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
870 MVT::i8},
871 Custom);
872
874 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
875 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
876 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
877 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
881 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
882 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
883 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
884 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
885 Custom);
886
892
893 // TODO: Could move this to custom lowering, could benefit from combines on
894 // extract of relevant bits.
896
898
899 if (Subtarget->hasBF16ConversionInsts()) {
903 }
904
905 if (Subtarget->hasCvtPkF16F32Inst()) {
907 }
908
911 ISD::SUB,
913 ISD::MUL,
914 ISD::FADD,
915 ISD::FSUB,
916 ISD::FDIV,
917 ISD::FMUL,
924 ISD::FMA,
925 ISD::SMIN,
926 ISD::SMAX,
927 ISD::UMIN,
928 ISD::UMAX,
931 ISD::SMIN,
932 ISD::SMAX,
933 ISD::UMIN,
934 ISD::UMAX,
935 ISD::AND,
936 ISD::OR,
937 ISD::XOR,
938 ISD::SHL,
939 ISD::SRL,
940 ISD::SRA,
941 ISD::FSHR,
951
952 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
954
955 // All memory operations. Some folding on the pointer operand is done to help
956 // matching the constant offsets in the addressing modes.
981
982 // FIXME: In other contexts we pretend this is a per-function property.
984
986}
987
988const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
989
990 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
991 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
992 return RCRegs;
993}
994
995//===----------------------------------------------------------------------===//
996// TargetLowering queries
997//===----------------------------------------------------------------------===//
998
999// v_mad_mix* support a conversion from f16 to f32.
1000//
1001 // There is only one special case where this is OK to use when denormals are
1002 // enabled, and we do not currently handle it.
1003bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1004 EVT DestVT, EVT SrcVT) const {
1005 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1006 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1007 DestVT.getScalarType() == MVT::f32 &&
1008 SrcVT.getScalarType() == MVT::f16 &&
1009 // TODO: This probably only requires no input flushing?
1011}
1012
1014 LLT DestTy, LLT SrcTy) const {
1015 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1016 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1017 DestTy.getScalarSizeInBits() == 32 &&
1018 SrcTy.getScalarSizeInBits() == 16 &&
1019 // TODO: This probably only requires no input flushing?
1020 denormalModeIsFlushAllF32(*MI.getMF());
1021}
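// Illustrative IR pattern these hooks allow to fold (assuming f32 denormals
// are flushed for the function):
//   %e = fpext half %a to float
//   %r = fma float %e, %b, %c   ; selectable as one v_fma_mix_f32 / v_mad_mix_f32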
1022
1024 // SI has some legal vector types, but no legal vector operations. Say no
1025 // shuffles are legal in order to prefer scalarizing some vector operations.
1026 return false;
1027}
1028
1031 EVT VT) const {
1034
1035 if (VT.isVector()) {
1036 EVT ScalarVT = VT.getScalarType();
1037 unsigned Size = ScalarVT.getSizeInBits();
1038 if (Size == 16) {
1039 if (Subtarget->has16BitInsts()) {
1040 if (VT.isInteger())
1041 return MVT::v2i16;
1042 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1043 }
1044 return VT.isInteger() ? MVT::i32 : MVT::f32;
1045 }
1046
1047 if (Size < 16)
1048 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1049 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1050 }
1051
1052 if (VT.getSizeInBits() > 32)
1053 return MVT::i32;
1054
1056}
1057
1060 EVT VT) const {
1063
1064 if (VT.isVector()) {
1065 unsigned NumElts = VT.getVectorNumElements();
1066 EVT ScalarVT = VT.getScalarType();
1067 unsigned Size = ScalarVT.getSizeInBits();
1068
1069 // FIXME: Should probably promote 8-bit vectors to i16.
1070 if (Size == 16 && Subtarget->has16BitInsts())
1071 return (NumElts + 1) / 2;
1072
1073 if (Size <= 32)
1074 return NumElts;
1075
1076 if (Size > 32)
1077 return NumElts * ((Size + 31) / 32);
1078 } else if (VT.getSizeInBits() > 32)
1079 return (VT.getSizeInBits() + 31) / 32;
1080
1082}
1083
1085 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1086 unsigned &NumIntermediates, MVT &RegisterVT) const {
1087 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1088 unsigned NumElts = VT.getVectorNumElements();
1089 EVT ScalarVT = VT.getScalarType();
1090 unsigned Size = ScalarVT.getSizeInBits();
1091 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1092 // support, but unless we can properly handle 3-vectors, it will still be
1093 // inconsistent.
1094 if (Size == 16 && Subtarget->has16BitInsts()) {
1095 if (ScalarVT == MVT::bf16) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = MVT::v2bf16;
1098 } else {
1099 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1100 IntermediateVT = RegisterVT;
1101 }
1102 NumIntermediates = (NumElts + 1) / 2;
1103 return NumIntermediates;
1104 }
1105
1106 if (Size == 32) {
1107 RegisterVT = ScalarVT.getSimpleVT();
1108 IntermediateVT = RegisterVT;
1109 NumIntermediates = NumElts;
1110 return NumIntermediates;
1111 }
1112
1113 if (Size < 16 && Subtarget->has16BitInsts()) {
1114 // FIXME: Should probably form v2i16 pieces
1115 RegisterVT = MVT::i16;
1116 IntermediateVT = ScalarVT;
1117 NumIntermediates = NumElts;
1118 return NumIntermediates;
1119 }
1120
1121 if (Size != 16 && Size <= 32) {
1122 RegisterVT = MVT::i32;
1123 IntermediateVT = ScalarVT;
1124 NumIntermediates = NumElts;
1125 return NumIntermediates;
1126 }
1127
1128 if (Size > 32) {
1129 RegisterVT = MVT::i32;
1130 IntermediateVT = RegisterVT;
1131 NumIntermediates = NumElts * ((Size + 31) / 32);
1132 return NumIntermediates;
1133 }
1134 }
1135
1137 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1138}
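// For example, with 16-bit instructions a v3f16 argument is split into two
// v2f16 intermediates ((3 + 1) / 2), while a v3i64 argument becomes six i32
// registers (3 elements * 2 dwords each).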
1139
1141 const DataLayout &DL, Type *Ty,
1142 unsigned MaxNumLanes) {
1143 assert(MaxNumLanes != 0);
1144
1145 LLVMContext &Ctx = Ty->getContext();
1146 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1147 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1148 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1149 NumElts);
1150 }
1151
1152 return TLI.getValueType(DL, Ty);
1153}
1154
1155// Peek through TFE struct returns to only use the data size.
1157 const DataLayout &DL, Type *Ty,
1158 unsigned MaxNumLanes) {
1159 auto *ST = dyn_cast<StructType>(Ty);
1160 if (!ST)
1161 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1162
1163 // TFE intrinsics return an aggregate type.
1164 assert(ST->getNumContainedTypes() == 2 &&
1165 ST->getContainedType(1)->isIntegerTy(32));
1166 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1167}
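// For example, a TFE image load returning { <4 x float>, i32 } is treated as a
// v4f32 memory access; the trailing i32 status word does not count towards the
// memory type.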
1168
1169/// Map address space 7 to MVT::v5i32 because that's its in-memory
1170/// representation. This return value is vector-typed because there is no
1171/// MVT::i160 and it is not clear if one can be added. While this could
1172/// cause issues during codegen, these address space 7 pointers will be
1173/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1174/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1175/// modeling, to work.
1177 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1178 return MVT::v5i32;
1180 DL.getPointerSizeInBits(AS) == 192)
1181 return MVT::v6i32;
1183}
1184/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1185/// v8i32 when padding is added.
1186/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1187/// also v8i32 with padding.
1189 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1190 DL.getPointerSizeInBits(AS) == 160) ||
1192 DL.getPointerSizeInBits(AS) == 192))
1193 return MVT::v8i32;
1195}
1196
1198 const CallInst &CI,
1199 MachineFunction &MF,
1200 unsigned IntrID) const {
1202 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1204
1205 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1207 AttributeList Attr =
1209 MemoryEffects ME = Attr.getMemoryEffects();
1210 if (ME.doesNotAccessMemory())
1211 return false;
1212
1213 // TODO: Should images get their own address space?
1214 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1215
1216 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1217 if (RsrcIntr->IsImage) {
1220 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1221 Info.align.reset();
1222 }
1223
1224 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1225 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1226 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1227 // We conservatively set the memory operand of a buffer intrinsic to the
1228 // base resource pointer, so that we can access alias information about
1229 // those pointers. Cases like "this points at the same value
1230 // but with a different offset" are handled in
1231 // areMemAccessesTriviallyDisjoint.
1232 Info.ptrVal = RsrcArg;
1233 }
1234
1235 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1236 if (!IsSPrefetch) {
1237 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1238 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1240 }
1241
1243 if (ME.onlyReadsMemory()) {
1244 if (RsrcIntr->IsImage) {
1245 unsigned MaxNumLanes = 4;
1246
1247 if (!BaseOpcode->Gather4) {
1248 // If this isn't a gather, we may have excess loaded elements in the
1249 // IR type. Check the dmask for the real number of elements loaded.
1250 unsigned DMask =
1251 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1252 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1253 }
1254
1255 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1256 CI.getType(), MaxNumLanes);
1257 } else {
1258 Info.memVT =
1260 std::numeric_limits<unsigned>::max());
1261 }
1262
1263 // FIXME: What does alignment mean for an image?
1266 } else if (ME.onlyWritesMemory()) {
1268
1269 Type *DataTy = CI.getArgOperand(0)->getType();
1270 if (RsrcIntr->IsImage) {
1271 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1272 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1273 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1274 DMaskLanes);
1275 } else
1276 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1277
1279 } else {
1280 // Atomic, NoReturn Sampler or prefetch
1283 Info.flags |=
1285
1286 if (!IsSPrefetch)
1288
1289 switch (IntrID) {
1290 default:
1291 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1292 // Fake memory access type for no return sampler intrinsics
1293 Info.memVT = MVT::i32;
1294 } else {
1295 // XXX - Should this be volatile without known ordering?
1297 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1298 }
1299 break;
1300 case Intrinsic::amdgcn_raw_buffer_load_lds:
1301 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1302 case Intrinsic::amdgcn_struct_buffer_load_lds:
1303 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1304 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1305 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1306 Info.ptrVal = CI.getArgOperand(1);
1307 return true;
1308 }
1309 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1310 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1311 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1312 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1313 Info.memVT =
1315 std::numeric_limits<unsigned>::max());
1316 Info.flags &= ~MachineMemOperand::MOStore;
1317 return true;
1318 }
1319 }
1320 }
1321 return true;
1322 }
1323
1324 switch (IntrID) {
1325 case Intrinsic::amdgcn_ds_ordered_add:
1326 case Intrinsic::amdgcn_ds_ordered_swap: {
1328 Info.memVT = MVT::getVT(CI.getType());
1329 Info.ptrVal = CI.getOperand(0);
1330 Info.align.reset();
1332
1333 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1334 if (!Vol->isZero())
1336
1337 return true;
1338 }
1339 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1340 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1342 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1343 Info.ptrVal = nullptr;
1344 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1346 return true;
1347 }
1348 case Intrinsic::amdgcn_ds_append:
1349 case Intrinsic::amdgcn_ds_consume: {
1351 Info.memVT = MVT::getVT(CI.getType());
1352 Info.ptrVal = CI.getOperand(0);
1353 Info.align.reset();
1355
1356 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1357 if (!Vol->isZero())
1359
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_atomic_csub: {
1364 Info.memVT = MVT::getVT(CI.getType());
1365 Info.ptrVal = CI.getOperand(0);
1366 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1373 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1374
1375 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1376 Info.align.reset();
1377 Info.flags |=
1379 return true;
1380 }
1381 case Intrinsic::amdgcn_global_atomic_fmin_num:
1382 case Intrinsic::amdgcn_global_atomic_fmax_num:
1383 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1384 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1385 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1386 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1388 Info.memVT = MVT::getVT(CI.getType());
1389 Info.ptrVal = CI.getOperand(0);
1390 Info.align.reset();
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_tr_b64:
1397 case Intrinsic::amdgcn_global_load_tr_b128:
1398 case Intrinsic::amdgcn_ds_read_tr4_b64:
1399 case Intrinsic::amdgcn_ds_read_tr6_b96:
1400 case Intrinsic::amdgcn_ds_read_tr8_b64:
1401 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1403 Info.memVT = MVT::getVT(CI.getType());
1404 Info.ptrVal = CI.getOperand(0);
1405 Info.align.reset();
1407 return true;
1408 }
1409 case Intrinsic::amdgcn_ds_gws_init:
1410 case Intrinsic::amdgcn_ds_gws_barrier:
1411 case Intrinsic::amdgcn_ds_gws_sema_v:
1412 case Intrinsic::amdgcn_ds_gws_sema_br:
1413 case Intrinsic::amdgcn_ds_gws_sema_p:
1414 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1416
1417 const GCNTargetMachine &TM =
1418 static_cast<const GCNTargetMachine &>(getTargetMachine());
1419
1421 Info.ptrVal = MFI->getGWSPSV(TM);
1422
1423 // This is an abstract access, but we need to specify a type and size.
1424 Info.memVT = MVT::i32;
1425 Info.size = 4;
1426 Info.align = Align(4);
1427
1428 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1430 else
1432 return true;
1433 }
1434 case Intrinsic::amdgcn_global_load_lds: {
1436 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1437 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1438 Info.ptrVal = CI.getArgOperand(1);
1440 return true;
1441 }
1442 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1444
1445 const GCNTargetMachine &TM =
1446 static_cast<const GCNTargetMachine &>(getTargetMachine());
1447
1449 Info.ptrVal = MFI->getGWSPSV(TM);
1450
1451 // This is an abstract access, but we need to specify a type and size.
1452 Info.memVT = MVT::i32;
1453 Info.size = 4;
1454 Info.align = Align(4);
1455
1457 return true;
1458 }
1459 case Intrinsic::amdgcn_s_prefetch_data: {
1461 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1462 Info.ptrVal = CI.getArgOperand(0);
1464 return true;
1465 }
1466 default:
1467 return false;
1468 }
1469}
1470
1472 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1473 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1474 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1475 // The DAG's ValueType loses the addrspaces.
1476 // Add them as 2 extra Constant operands "from" and "to".
1477 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1478 unsigned DstAS = I.getType()->getPointerAddressSpace();
1479 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1480 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1481 break;
1482 }
1483 default:
1484 break;
1485 }
1486}
1487
1490 Type *&AccessTy) const {
1491 Value *Ptr = nullptr;
1492 switch (II->getIntrinsicID()) {
1493 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1494 case Intrinsic::amdgcn_ds_append:
1495 case Intrinsic::amdgcn_ds_consume:
1496 case Intrinsic::amdgcn_ds_read_tr4_b64:
1497 case Intrinsic::amdgcn_ds_read_tr6_b96:
1498 case Intrinsic::amdgcn_ds_read_tr8_b64:
1499 case Intrinsic::amdgcn_ds_read_tr16_b64:
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap:
1502 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1503 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1504 case Intrinsic::amdgcn_global_atomic_csub:
1505 case Intrinsic::amdgcn_global_atomic_fmax_num:
1506 case Intrinsic::amdgcn_global_atomic_fmin_num:
1507 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1508 case Intrinsic::amdgcn_global_load_tr_b64:
1509 case Intrinsic::amdgcn_global_load_tr_b128:
1510 Ptr = II->getArgOperand(0);
1511 break;
1512 case Intrinsic::amdgcn_global_load_lds:
1513 Ptr = II->getArgOperand(1);
1514 break;
1515 default:
1516 return false;
1517 }
1518 AccessTy = II->getType();
1519 Ops.push_back(Ptr);
1520 return true;
1521}
1522
1523 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1524 unsigned AddrSpace) const {
1525 if (!Subtarget->hasFlatInstOffsets()) {
1526 // Flat instructions do not have offsets, and only have the register
1527 // address.
1528 return AM.BaseOffs == 0 && AM.Scale == 0;
1529 }
1530
1531 decltype(SIInstrFlags::FLAT) FlatVariant =
1532 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
1533 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1534 : SIInstrFlags::FLAT;
1535
1536 return AM.Scale == 0 &&
1537 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1538 AM.BaseOffs, AddrSpace, FlatVariant));
1539}
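// In short: flat-style addressing never folds a scaled index, and an immediate
// offset is only folded when it fits the offset field of the flat, global, or
// scratch form selected for this address space.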
1540
1541 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1542 if (Subtarget->hasFlatGlobalInsts())
1543 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1544
1545 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1546 // Assume that we will use FLAT for all global memory accesses
1547 // on VI.
1548 // FIXME: This assumption is currently wrong. On VI we still use
1549 // MUBUF instructions for the r + i addressing mode. As currently
1550 // implemented, the MUBUF instructions only work on buffer < 4GB.
1551 // It may be possible to support > 4GB buffers with MUBUF instructions,
1552 // by setting the stride value in the resource descriptor which would
1553 // increase the size limit to (stride * 4GB). However, this is risky,
1554 // because it has never been validated.
1555 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1556 }
1557
1558 return isLegalMUBUFAddressingMode(AM);
1559}
1560
1561bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1562 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1563 // additionally can do r + r + i with addr64. 32-bit has more addressing
1564 // mode options. Depending on the resource constant, it can also do
1565 // (i64 r0) + (i32 r1) * (i14 i).
1566 //
1567 // Private arrays end up using a scratch buffer most of the time, so also
1568 // assume those use MUBUF instructions. Scratch loads / stores are currently
1569 // implemented as mubuf instructions with the offen bit set, so slightly
1570 // different from the normal addr64.
1571 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1572 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1573 return false;
1574
1575 // FIXME: Since we can split immediate into soffset and immediate offset,
1576 // would it make sense to allow any immediate?
1577
1578 switch (AM.Scale) {
1579 case 0: // r + i or just i, depending on HasBaseReg.
1580 return true;
1581 case 1:
1582 return true; // We have r + r or r + i.
1583 case 2:
1584 if (AM.HasBaseReg) {
1585 // Reject 2 * r + r.
1586 return false;
1587 }
1588
1589 // Allow 2 * r as r + r
1590 // Or 2 * r + i is allowed as r + r + i.
1591 return true;
1592 default: // Don't allow n * r
1593 return false;
1594 }
1595}
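// Rough examples under these rules: "reg + imm", "reg + reg" and "2*reg + imm"
// are accepted, "2*reg + reg" and any higher scale are rejected, and an
// immediate that does not fit the MUBUF offset field rejects the mode outright.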
1596
1598 const AddrMode &AM, Type *Ty,
1599 unsigned AS,
1600 Instruction *I) const {
1601 // No global is ever allowed as a base.
1602 if (AM.BaseGV)
1603 return false;
1604
1605 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1606 return isLegalGlobalAddressingMode(AM);
1607
1608 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1612 // If the offset isn't a multiple of 4, it probably isn't going to be
1613 // correctly aligned.
1614 // FIXME: Can we get the real alignment here?
1615 if (AM.BaseOffs % 4 != 0)
1616 return isLegalMUBUFAddressingMode(AM);
1617
1618 if (!Subtarget->hasScalarSubwordLoads()) {
1619 // There are no SMRD extloads, so if we have to do a small type access we
1620 // will use a MUBUF load.
1621 // FIXME?: We also need to do this if unaligned, but we don't know the
1622 // alignment here.
1623 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1624 return isLegalGlobalAddressingMode(AM);
1625 }
1626
1628 // SMRD instructions have an 8-bit, dword offset on SI.
1629 if (!isUInt<8>(AM.BaseOffs / 4))
1630 return false;
1631 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1632 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1633 // in 8-bits, it can use a smaller encoding.
1634 if (!isUInt<32>(AM.BaseOffs / 4))
1635 return false;
1636 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1637 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1638 if (!isUInt<20>(AM.BaseOffs))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1641 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1642 // for S_BUFFER_* instructions).
1643 if (!isInt<21>(AM.BaseOffs))
1644 return false;
1645 } else {
1646 // On GFX12, all offsets are signed 24-bit in bytes.
1647 if (!isInt<24>(AM.BaseOffs))
1648 return false;
1649 }
1650
1651 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1653 AM.BaseOffs < 0) {
1654 // Scalar (non-buffer) loads can only use a negative offset if
1655 // soffset+offset is non-negative. Since the compiler can only prove that
1656 // in a few special cases, it is safer to claim that negative offsets are
1657 // not supported.
1658 return false;
1659 }
1660
1661 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1662 return true;
1663
1664 if (AM.Scale == 1 && AM.HasBaseReg)
1665 return true;
1666
1667 return false;
1668 }
1669
1670 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1671 return Subtarget->enableFlatScratch()
1673 : isLegalMUBUFAddressingMode(AM);
1674
1675 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1676 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1677 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1678 // field.
1679 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1680 // an 8-bit dword offset but we don't know the alignment here.
1681 if (!isUInt<16>(AM.BaseOffs))
1682 return false;
1683
1684 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1685 return true;
1686
1687 if (AM.Scale == 1 && AM.HasBaseReg)
1688 return true;
1689
1690 return false;
1691 }
1692
1694 // For an unknown address space, this usually means that this is for some
1695 // reason being used for pure arithmetic, and not based on some addressing
1696 // computation. We don't have instructions that compute pointers with any
1697 // addressing modes, so treat them as having no offset like flat
1698 // instructions.
1700 }
1701
1702 // Assume a user alias of global for unknown address spaces.
1703 return isLegalGlobalAddressingMode(AM);
1704}
1705
1706 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1707 const MachineFunction &MF) const {
1708 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1709 return (MemVT.getSizeInBits() <= 4 * 32);
1710 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1711 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1712 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1713 }
1714 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1715 return (MemVT.getSizeInBits() <= 2 * 32);
1716 return true;
1717}
1718
1720 unsigned Size, unsigned AddrSpace, Align Alignment,
1721 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1722 if (IsFast)
1723 *IsFast = 0;
1724
1725 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1726 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1727 // Check if alignment requirements for ds_read/write instructions are
1728 // disabled.
1729 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1730 return false;
1731
1732 Align RequiredAlignment(
1733 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1734 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1735 Alignment < RequiredAlignment)
1736 return false;
1737
1738 // Either the alignment requirements are "enabled", or there is an
1739 // unaligned LDS access related hardware bug even though alignment requirements
1740 // are "disabled". In either case, we need to check for proper alignment
1741 // requirements.
1742 //
1743 switch (Size) {
1744 case 64:
1745 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1746 // address is negative, then the instruction is incorrectly treated as
1747 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1748 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1749 // load later in the SILoadStoreOptimizer.
1750 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1751 return false;
1752
1753 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1754 // can do a 4 byte aligned, 8 byte access in a single operation using
1755 // ds_read2/write2_b32 with adjacent offsets.
1756 RequiredAlignment = Align(4);
1757
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1760 // ds_write2_b32 depending on the alignment. In either case with either
1761 // alignment there is no faster way of doing this.
1762
1763 // The numbers returned here and below are not additive, it is a 'speed
1764 // rank'. They are just meant to be compared to decide if a certain way
1765 // of lowering an operation is faster than another. For that purpose
1766 // naturally aligned operation gets its bitsize to indicate that "it
1767 // operates with a speed comparable to N-bit wide load". With the full
1768 // alignment ds128 is slower than ds96 for example. If underaligned it
1769 // is comparable to a speed of a single dword access, which would then
1770 // mean 32 < 128 and it is faster to issue a wide load regardless.
1771 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1772 // wider load which will no longer be aligned, the latter is slower.
1773 if (IsFast)
1774 *IsFast = (Alignment >= RequiredAlignment) ? 64
1775 : (Alignment < Align(4)) ? 32
1776 : 1;
1777 return true;
1778 }
1779
1780 break;
1781 case 96:
1782 if (!Subtarget->hasDS96AndDS128())
1783 return false;
1784
1785 // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1786 // gfx8 and older.
1787
1788 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1789 // Naturally aligned access is fastest. However, also report it is Fast
1790 // if memory is aligned less than DWORD. A narrow load or store will be
1791 // just as slow as a single ds_read_b96/ds_write_b96, but there will
1792 // be more of them, so overall we will pay less penalty issuing a single
1793 // instruction.
1794
1795 // See comment on the values above.
1796 if (IsFast)
1797 *IsFast = (Alignment >= RequiredAlignment) ? 96
1798 : (Alignment < Align(4)) ? 32
1799 : 1;
1800 return true;
1801 }
1802
1803 break;
1804 case 128:
1805 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1806 return false;
1807
1808 // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1809 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1810 // single operation using ds_read2/write2_b64.
1811 RequiredAlignment = Align(8);
1812
1813 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1814 // Naturally aligned access is fastest. However, also report it is Fast
1815 // if memory is aligned less than DWORD. A narrow load or store will be
1816 // just as slow as a single ds_read_b128/ds_write_b128, but there
1817 // will be more of them, so overall we will pay less penalty issuing a
1818 // single instruction.
1819
1820 // See comment on the values above.
1821 if (IsFast)
1822 *IsFast = (Alignment >= RequiredAlignment) ? 128
1823 : (Alignment < Align(4)) ? 32
1824 : 1;
1825 return true;
1826 }
1827
1828 break;
1829 default:
1830 if (Size > 32)
1831 return false;
1832
1833 break;
1834 }
1835
1836 // See comment on the values above.
1837 // Note that we have a single-dword or sub-dword here, so if underaligned
1838 // it is a slowest possible access, hence returned value is 0.
1839 if (IsFast)
1840 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1841
1842 return Alignment >= RequiredAlignment ||
1843 Subtarget->hasUnalignedDSAccessEnabled();
1844 }
1845
1846 // FIXME: We have to be conservative here and assume that flat operations
1847 // will access scratch. If we had access to the IR function, then we
1848 // could determine if any private memory was used in the function.
1849 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1850 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1851 bool AlignedBy4 = Alignment >= Align(4);
1852 if (IsFast)
1853 *IsFast = AlignedBy4;
1854
1855 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1856 }
1857
1858 // So long as they are correct, wide global memory operations perform better
1859 // than multiple smaller memory ops -- even when misaligned
1860 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1861 if (IsFast)
1862 *IsFast = Size;
1863
1864 return Alignment >= Align(4) ||
1866 }
1867
1868 // Smaller than dword value must be aligned.
1869 if (Size < 32)
1870 return false;
1871
1872 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1873 // byte-address are ignored, thus forcing Dword alignment.
1874 // This applies to private, global, and constant memory.
1875 if (IsFast)
1876 *IsFast = 1;
1877
1878 return Size >= 32 && Alignment >= Align(4);
1879}
1880
1882 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1883 unsigned *IsFast) const {
1885 Alignment, Flags, IsFast);
1886}
1887
1889 const MemOp &Op, const AttributeList &FuncAttributes) const {
1890 // FIXME: Should account for address space here.
1891
1892 // The default fallback uses the private pointer size as a guess for a type to
1893 // use. Make sure we switch these to 64-bit accesses.
1894
1895 if (Op.size() >= 16 &&
1896 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1897 return MVT::v4i32;
1898
1899 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1900 return MVT::v2i32;
1901
1902 // Use the default.
1903 return MVT::Other;
1904}
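// For example, a 16-byte memcpy with a 4-byte aligned destination is expanded
// with v4i32 (dwordx4) accesses and an 8-byte one with v2i32, rather than the
// default private-pointer-sized scalar accesses.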
1905
1907 const MemSDNode *MemNode = cast<MemSDNode>(N);
1908 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1909}
1910
1912 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1914}
1915
1917 unsigned DestAS) const {
1918 // Flat -> private/local is a simple truncate.
1919 // Flat -> global is no-op
1920 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1921 return true;
1922
1923 const GCNTargetMachine &TM =
1924 static_cast<const GCNTargetMachine &>(getTargetMachine());
1925 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1926}
1927
1930 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1931 VT.getScalarType().bitsLE(MVT::i16))
1934}
1935
1937 Type *Ty) const {
1938 // FIXME: Could be smarter if called for vector constants.
1939 return true;
1940}
1941
1943 unsigned Index) const {
1945 return false;
1946
1947 // TODO: Add more cases that are cheap.
1948 return Index == 0;
1949}
1950
1952 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1953 switch (Op) {
1954 case ISD::LOAD:
1955 case ISD::STORE:
1956 return true;
1957 default:
1958 return false;
1959 }
1960 }
1961
1962 // SimplifySetCC uses this function to determine whether or not it should
1963 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1964 if (VT == MVT::i1 && Op == ISD::SETCC)
1965 return false;
1966
1968}
1969
1970SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1971 const SDLoc &SL,
1972 SDValue Chain,
1973 uint64_t Offset) const {
1974 const DataLayout &DL = DAG.getDataLayout();
1978
1979 auto [InputPtrReg, RC, ArgTy] =
1981
1982 // We may not have the kernarg segment argument if we have no kernel
1983 // arguments.
1984 if (!InputPtrReg)
1985 return DAG.getConstant(Offset, SL, PtrVT);
1986
1988 SDValue BasePtr = DAG.getCopyFromReg(
1989 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1990
1991 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1992}
1993
1994SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1995 const SDLoc &SL) const {
1998 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1999}
2000
2001SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2002 const SDLoc &SL) const {
2003
2005 std::optional<uint32_t> KnownSize =
2007 if (KnownSize.has_value())
2008 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2009 return SDValue();
2010}
2011
2012SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2013 const SDLoc &SL, SDValue Val,
2014 bool Signed,
2015 const ISD::InputArg *Arg) const {
2016 // First, if it is a widened vector, narrow it.
2017 if (VT.isVector() &&
2019 EVT NarrowedVT =
2022 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2023 DAG.getConstant(0, SL, MVT::i32));
2024 }
2025
2026 // Then convert the vector elements or scalar value.
2027 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2028 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2029 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2030 }
2031
2032 if (MemVT.isFloatingPoint())
2033 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2034 else if (Signed)
2035 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2036 else
2037 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2038
2039 return Val;
2040}
2041
2042SDValue SITargetLowering::lowerKernargMemParameter(
2043 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2044 uint64_t Offset, Align Alignment, bool Signed,
2045 const ISD::InputArg *Arg) const {
2047
2048 // Try to avoid using an extload by loading earlier than the argument address,
2049 // and extracting the relevant bits. The load should hopefully be merged with
2050 // the previous argument.
2051 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2052 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
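 // For example (illustrative): an i16 argument at kernarg offset 6 gives
 // AlignDownOffset = 4 and OffsetDiff = 2, so we load the i32 at offset 4,
 // shift it right by OffsetDiff * 8 = 16 bits, and truncate back to i16.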
2053 int64_t AlignDownOffset = alignDown(Offset, 4);
2054 int64_t OffsetDiff = Offset - AlignDownOffset;
2055
2056 EVT IntVT = MemVT.changeTypeToInteger();
2057
2058 // TODO: If we passed in the base kernel offset we could have a better
2059 // alignment than 4, but we don't really need it.
2060 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2061 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2064
2065 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2066 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2067
2068 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2069 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2070 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2071
2072 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2073 }
2074
2075 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2076 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2079
2080 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2081 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2082}
2083
2084SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2085 CCValAssign &VA, const SDLoc &SL,
2086 SDValue Chain,
2087 const ISD::InputArg &Arg) const {
2089 MachineFrameInfo &MFI = MF.getFrameInfo();
2090
2091 if (Arg.Flags.isByVal()) {
2092 unsigned Size = Arg.Flags.getByValSize();
2093 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2094 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2095 }
2096
2097 unsigned ArgOffset = VA.getLocMemOffset();
2098 unsigned ArgSize = VA.getValVT().getStoreSize();
2099
2100 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2101
2102 // Create load nodes to retrieve arguments from the stack.
2103 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2104 SDValue ArgValue;
2105
2106 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2108 MVT MemVT = VA.getValVT();
2109
2110 switch (VA.getLocInfo()) {
2111 default:
2112 break;
2113 case CCValAssign::BCvt:
2114 MemVT = VA.getLocVT();
2115 break;
2116 case CCValAssign::SExt:
2117 ExtType = ISD::SEXTLOAD;
2118 break;
2119 case CCValAssign::ZExt:
2120 ExtType = ISD::ZEXTLOAD;
2121 break;
2122 case CCValAssign::AExt:
2123 ExtType = ISD::EXTLOAD;
2124 break;
2125 }
2126
2127 ArgValue = DAG.getExtLoad(
2128 ExtType, SL, VA.getLocVT(), Chain, FIN,
2130 return ArgValue;
2131}
2132
2133SDValue SITargetLowering::getPreloadedValue(
2134 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2136 const ArgDescriptor *Reg = nullptr;
2137 const TargetRegisterClass *RC;
2138 LLT Ty;
2139
2141 const ArgDescriptor WorkGroupIDX =
2142 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2143 // If GridZ is not programmed in an entry function then the hardware will set
2144 // it to all zeros, so there is no need to mask the GridY value in the low
2145 // order bits.
2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147 AMDGPU::TTMP7,
2148 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149 const ArgDescriptor WorkGroupIDZ =
2150 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
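 // To summarize the architected-SGPR layout used above: TTMP9 carries the
 // workgroup ID X, while TTMP7 packs Y into bits [15:0] and Z into bits
 // [31:16], which is what the masks select.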
2151 if (Subtarget->hasArchitectedSGPRs() &&
2153 switch (PVID) {
2155 Reg = &WorkGroupIDX;
2156 RC = &AMDGPU::SReg_32RegClass;
2157 Ty = LLT::scalar(32);
2158 break;
2160 Reg = &WorkGroupIDY;
2161 RC = &AMDGPU::SReg_32RegClass;
2162 Ty = LLT::scalar(32);
2163 break;
2165 Reg = &WorkGroupIDZ;
2166 RC = &AMDGPU::SReg_32RegClass;
2167 Ty = LLT::scalar(32);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173
2174 if (!Reg)
2175 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2176 if (!Reg) {
2178 // It's possible for a kernarg intrinsic call to appear in a kernel with
2179 // no allocated segment, in which case we do not add the user sgpr
2180 // argument, so just return null.
2181 return DAG.getConstant(0, SDLoc(), VT);
2182 }
2183
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2185 // attributes uses the corresponding intrinsic.
2186 return DAG.getUNDEF(VT);
2187 }
2188
2189 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2190}
2191
2193 CallingConv::ID CallConv,
2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195 FunctionType *FType,
2196 SIMachineFunctionInfo *Info) {
2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198 const ISD::InputArg *Arg = &Ins[I];
2199
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201 "vector type argument should have been split");
2202
2203 // First check if it's a PS input addr.
2204 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2205 PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207
2208 // Inconveniently only the first part of the split is marked as isSplit,
2209 // so skip to the end. We only want to increment PSInputNum once for the
2210 // entire split argument.
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2214 "unexpected vector split in ps argument type");
2215 if (!SkipArg)
2216 Splits.push_back(*Arg);
2217 Arg = &Ins[++I];
2218 }
2219 }
2220
2221 if (SkipArg) {
2222 // We can safely skip PS inputs.
2223 Skipped.set(Arg->getOrigArgIndex());
2224 ++PSInputNum;
2225 continue;
2226 }
2227
2228 Info->markPSInputAllocated(PSInputNum);
2229 if (Arg->Used)
2230 Info->markPSInputEnabled(PSInputNum);
2231
2232 ++PSInputNum;
2233 }
2234
2235 Splits.push_back(*Arg);
2236 }
2237}
2238
2239// Allocate special inputs passed in VGPRs.
2241 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2242 SIMachineFunctionInfo &Info) const {
2243 const LLT S32 = LLT::scalar(32);
2245
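 // Entry functions receive the workitem IDs in VGPRs. With packed TIDs all
 // three IDs share VGPR0 (X in bits [9:0], Y in [19:10], Z in [29:20]);
 // otherwise X, Y and Z arrive separately in VGPR0, VGPR1 and VGPR2.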
2246 if (Info.hasWorkItemIDX()) {
2247 Register Reg = AMDGPU::VGPR0;
2248 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2249
2250 CCInfo.AllocateReg(Reg);
2251 unsigned Mask =
2252 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2253 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2254 }
2255
2256 if (Info.hasWorkItemIDY()) {
2257 assert(Info.hasWorkItemIDX());
2258 if (Subtarget->hasPackedTID()) {
2259 Info.setWorkItemIDY(
2260 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2261 } else {
2262 unsigned Reg = AMDGPU::VGPR1;
2263 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2264
2265 CCInfo.AllocateReg(Reg);
2266 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2267 }
2268 }
2269
2270 if (Info.hasWorkItemIDZ()) {
2271 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2272 if (Subtarget->hasPackedTID()) {
2273 Info.setWorkItemIDZ(
2274 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2275 } else {
2276 unsigned Reg = AMDGPU::VGPR2;
2277 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2278
2279 CCInfo.AllocateReg(Reg);
2280 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2281 }
2282 }
2283}
2284
2285// Try to allocate a VGPR at the end of the argument list, or if no argument
2286// VGPRs are left, allocate a stack slot instead.
2287// If \p Mask is given, it indicates the bitfield position in the register.
2288// If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2289static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2290 ArgDescriptor Arg = ArgDescriptor()) {
2291 if (Arg.isSet())
2292 return ArgDescriptor::createArg(Arg, Mask);
2293
2294 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2296 if (RegIdx == ArgVGPRs.size()) {
2297 // Spill to stack required.
2298 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2299
2300 return ArgDescriptor::createStack(Offset, Mask);
2301 }
2302
2303 unsigned Reg = ArgVGPRs[RegIdx];
2304 Reg = CCInfo.AllocateReg(Reg);
2305 assert(Reg != AMDGPU::NoRegister);
2306
2307 MachineFunction &MF = CCInfo.getMachineFunction();
2308 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2309 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2310 return ArgDescriptor::createRegister(Reg, Mask);
2311}
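// Note: allocateSpecialInputVGPRs below chains calls to this helper, passing
// the previously returned descriptor so that the Y and Z IDs can reuse the
// same VGPR (or stack slot) with different masks instead of allocating new
// registers.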
2312
2314 const TargetRegisterClass *RC,
2315 unsigned NumArgRegs) {
2316 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2317 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2318 if (RegIdx == ArgSGPRs.size())
2319 report_fatal_error("ran out of SGPRs for arguments");
2320
2321 unsigned Reg = ArgSGPRs[RegIdx];
2322 Reg = CCInfo.AllocateReg(Reg);
2323 assert(Reg != AMDGPU::NoRegister);
2324
2325 MachineFunction &MF = CCInfo.getMachineFunction();
2326 MF.addLiveIn(Reg, RC);
2328}
2329
2330// If this has a fixed position, we still should allocate the register in the
2331// CCInfo state. Technically we could get away with this for values passed
2332// outside of the normal argument range.
2334 const TargetRegisterClass *RC,
2335 MCRegister Reg) {
2336 Reg = CCInfo.AllocateReg(Reg);
2337 assert(Reg != AMDGPU::NoRegister);
2338 MachineFunction &MF = CCInfo.getMachineFunction();
2339 MF.addLiveIn(Reg, RC);
2340}
2341
2342static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2343 if (Arg) {
2344 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2345 Arg.getRegister());
2346 } else
2347 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2348}
2349
2350static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2351 if (Arg) {
2352 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2353 Arg.getRegister());
2354 } else
2355 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2356}
2357
2358/// Allocate implicit function VGPR arguments at the end of allocated user
2359/// arguments.
2361 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2362 SIMachineFunctionInfo &Info) const {
2363 const unsigned Mask = 0x3ff;
2364 ArgDescriptor Arg;
2365
2366 if (Info.hasWorkItemIDX()) {
2367 Arg = allocateVGPR32Input(CCInfo, Mask);
2368 Info.setWorkItemIDX(Arg);
2369 }
2370
2371 if (Info.hasWorkItemIDY()) {
2372 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2373 Info.setWorkItemIDY(Arg);
2374 }
2375
2376 if (Info.hasWorkItemIDZ())
2377 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2378}
2379
2380/// Allocate implicit function VGPR arguments in fixed registers.
2382 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2383 SIMachineFunctionInfo &Info) const {
2384 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2385 if (!Reg)
2386 report_fatal_error("failed to allocate VGPR for implicit arguments");
2387
2388 const unsigned Mask = 0x3ff;
2389 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2390 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2391 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2392}
2393
2395 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2396 SIMachineFunctionInfo &Info) const {
2397 auto &ArgInfo = Info.getArgInfo();
2398 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2399
2400 // TODO: Unify handling with private memory pointers.
2401 if (UserSGPRInfo.hasDispatchPtr())
2402 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2403
2404 if (UserSGPRInfo.hasQueuePtr())
2405 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2406
2407 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2408 // constant offset from the kernarg segment.
2409 if (Info.hasImplicitArgPtr())
2410 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2411
2412 if (UserSGPRInfo.hasDispatchID())
2413 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2414
2415 // flat_scratch_init is not applicable for non-kernel functions.
2416
2417 if (Info.hasWorkGroupIDX())
2418 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2419
2420 if (Info.hasWorkGroupIDY())
2421 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2422
2423 if (Info.hasWorkGroupIDZ())
2424 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2425
2426 if (Info.hasLDSKernelId())
2427 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2428}
2429
2430// Allocate special inputs passed in user SGPRs.
2432 MachineFunction &MF,
2433 const SIRegisterInfo &TRI,
2434 SIMachineFunctionInfo &Info) const {
2435 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2436 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2437 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2438 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2439 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2440 }
2441
2442 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2443 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2444 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2445 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2446 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2447 }
2448
2449 if (UserSGPRInfo.hasDispatchPtr()) {
2450 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2451 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2452 CCInfo.AllocateReg(DispatchPtrReg);
2453 }
2454
2455 if (UserSGPRInfo.hasQueuePtr()) {
2456 Register QueuePtrReg = Info.addQueuePtr(TRI);
2457 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(QueuePtrReg);
2459 }
2460
2461 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2463 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2464 CCInfo.AllocateReg(InputPtrReg);
2465
2466 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2467 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2468 }
2469
2470 if (UserSGPRInfo.hasDispatchID()) {
2471 Register DispatchIDReg = Info.addDispatchID(TRI);
2472 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2473 CCInfo.AllocateReg(DispatchIDReg);
2474 }
2475
2476 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2477 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2478 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2479 CCInfo.AllocateReg(FlatScratchInitReg);
2480 }
2481
2482 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2483 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2484 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2485 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2486 }
2487
2488 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2489 // these from the dispatch pointer.
2490}
2491
2492// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2493// sequential, starting from the first argument.
2495 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2497 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2498 Function &F = MF.getFunction();
2499 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2500 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2501 bool InPreloadSequence = true;
2502 unsigned InIdx = 0;
2503 bool AlignedForImplictArgs = false;
2504 unsigned ImplicitArgOffset = 0;
2505 for (auto &Arg : F.args()) {
2506 if (!InPreloadSequence || !Arg.hasInRegAttr())
2507 break;
2508
2509 unsigned ArgIdx = Arg.getArgNo();
2510 // Don't preload non-original args or parts not in the current preload
2511 // sequence.
2512 if (InIdx < Ins.size() &&
2513 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2514 break;
2515
2516 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2517 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2518 InIdx++) {
2519 assert(ArgLocs[ArgIdx].isMemLoc());
2520 auto &ArgLoc = ArgLocs[InIdx];
2521 const Align KernelArgBaseAlign = Align(16);
2522 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2523 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2524 unsigned NumAllocSGPRs =
2525 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
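 // For example, a 64-bit argument occupies alignTo(64, 32) / 32 = 2 user
 // SGPRs.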
2526
2527 // Fix alignment for hidden arguments.
2528 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2529 if (!AlignedForImplictArgs) {
2530 ImplicitArgOffset =
2531 alignTo(LastExplicitArgOffset,
2532 Subtarget->getAlignmentForImplicitArgPtr()) -
2533 LastExplicitArgOffset;
2534 AlignedForImplictArgs = true;
2535 }
2536 ArgOffset += ImplicitArgOffset;
2537 }
2538
2539 // Arg is preloaded into the previous SGPR.
2540 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2541 assert(InIdx >= 1 && "No previous SGPR");
2542 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2543 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2544 continue;
2545 }
2546
2547 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2548 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2549 // Check for free user SGPRs for preloading.
2550 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2551 InPreloadSequence = false;
2552 break;
2553 }
2554
2555 // Preload this argument.
2556 const TargetRegisterClass *RC =
2557 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2558 SmallVectorImpl<MCRegister> *PreloadRegs =
2559 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2560
2561 if (PreloadRegs->size() > 1)
2562 RC = &AMDGPU::SGPR_32RegClass;
2563 for (auto &Reg : *PreloadRegs) {
2564 assert(Reg);
2565 MF.addLiveIn(Reg, RC);
2566 CCInfo.AllocateReg(Reg);
2567 }
2568
2569 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2570 }
2571 }
2572}
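// Illustrative sketch (not taken from this file): for a kernel such as
//   define amdgpu_kernel void @k(i32 inreg %a, i64 inreg %b)
// %a sits at kernarg offset 0 and takes one user SGPR, while %b sits at
// offset 8, so one padding SGPR is skipped (4 bytes of padding) and %b is
// preloaded into the next two SGPRs, assuming enough free user SGPRs remain.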
2573
2575 const SIRegisterInfo &TRI,
2576 SIMachineFunctionInfo &Info) const {
2577 // Always allocate this last since it is a synthetic preload.
2578 if (Info.hasLDSKernelId()) {
2579 Register Reg = Info.addLDSKernelId();
2580 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2581 CCInfo.AllocateReg(Reg);
2582 }
2583}
2584
2585// Allocate special input registers that are initialized per-wave.
2588 CallingConv::ID CallConv,
2589 bool IsShader) const {
2590 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2591 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2592 // Note: user SGPRs are handled by the front-end for graphics shaders
2593 // Pad up the used user SGPRs with dead inputs.
2594
2595 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2596 // before enabling architected SGPRs for workgroup IDs.
2597 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2598
2599 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2600 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2601 // rely on it to reach 16 since if we end up having no stack usage, it will
2602 // not really be added.
2603 unsigned NumRequiredSystemSGPRs =
2604 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2605 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2606 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 Register Reg = Info.addReservedUserSGPR();
2608 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2609 CCInfo.AllocateReg(Reg);
2610 }
2611 }
2612
2613 if (!HasArchitectedSGPRs) {
2614 if (Info.hasWorkGroupIDX()) {
2615 Register Reg = Info.addWorkGroupIDX();
2616 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2617 CCInfo.AllocateReg(Reg);
2618 }
2619
2620 if (Info.hasWorkGroupIDY()) {
2621 Register Reg = Info.addWorkGroupIDY();
2622 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(Reg);
2624 }
2625
2626 if (Info.hasWorkGroupIDZ()) {
2627 Register Reg = Info.addWorkGroupIDZ();
2628 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(Reg);
2630 }
2631 }
2632
2633 if (Info.hasWorkGroupInfo()) {
2634 Register Reg = Info.addWorkGroupInfo();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasPrivateSegmentWaveByteOffset()) {
2640 // Scratch wave offset passed in system SGPR.
2641 unsigned PrivateSegmentWaveByteOffsetReg;
2642
2643 if (IsShader) {
2644 PrivateSegmentWaveByteOffsetReg =
2645 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2646
2647 // This is true if the scratch wave byte offset doesn't have a fixed
2648 // location.
2649 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2651 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2652 }
2653 } else
2654 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2655
2656 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2657 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2658 }
2659
2660 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2661 Info.getNumPreloadedSGPRs() >= 16);
2662}
2663
2665 MachineFunction &MF,
2666 const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) {
2668 // Now that we've figured out where the scratch register inputs are, see if
2669 // we should reserve the arguments and use them directly.
2670 MachineFrameInfo &MFI = MF.getFrameInfo();
2671 bool HasStackObjects = MFI.hasStackObjects();
2672 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2673
2674 // Record that we know we have non-spill stack objects so we don't need to
2675 // check all stack objects later.
2676 if (HasStackObjects)
2677 Info.setHasNonSpillStackObjects(true);
2678
2679 // Everything live out of a block is spilled with fast regalloc, so it's
2680 // almost certain that spilling will be required.
2681 if (TM.getOptLevel() == CodeGenOptLevel::None)
2682 HasStackObjects = true;
2683
2684 // For now assume stack access is needed in any callee functions, so we need
2685 // the scratch registers to pass in.
2686 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2687
2688 if (!ST.enableFlatScratch()) {
2689 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2690 // If we have stack objects, we unquestionably need the private buffer
2691 // resource. For the Code Object V2 ABI, this will be the first 4 user
2692 // SGPR inputs. We can reserve those and use them directly.
2693
2694 Register PrivateSegmentBufferReg =
2696 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 } else {
2698 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2699 // We tentatively reserve the last (highest-numbered) registers (skipping
2700 // those which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2701 // we'll replace these with the ones immediately after those which were
2702 // really allocated. In the prologue, copies will be inserted from the
2703 // argument to these reserved registers.
2704
2705 // Without HSA, relocations are used for the scratch pointer and the
2706 // buffer resource setup is always inserted in the prologue. Scratch wave
2707 // offset is still in an input SGPR.
2708 Info.setScratchRSrcReg(ReservedBufferReg);
2709 }
2710 }
2711
2713
2714 // For entry functions we have to set up the stack pointer if we use it,
2715 // whereas non-entry functions get this "for free". This means there is no
2716 // intrinsic advantage to using S32 over S34 in cases where we do not have
2717 // calls but do need a frame pointer (i.e. if we are requested to have one
2718 // because frame pointer elimination is disabled). To keep things simple we
2719 // only ever use S32 as the call ABI stack pointer, and so using it does not
2720 // imply we need a separate frame pointer.
2721 //
2722 // Try to use s32 as the SP, but move it if it would interfere with input
2723 // arguments. This won't work with calls though.
2724 //
2725 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2726 // registers.
2727 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2728 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2729 } else {
2731
2732 if (MFI.hasCalls())
2733 report_fatal_error("call in graphics shader with too many input SGPRs");
2734
2735 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2736 if (!MRI.isLiveIn(Reg)) {
2737 Info.setStackPtrOffsetReg(Reg);
2738 break;
2739 }
2740 }
2741
2742 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2743 report_fatal_error("failed to find register for SP");
2744 }
2745
2746 // hasFP should be accurate for entry functions even before the frame is
2747 // finalized, because it does not rely on the known stack size, only
2748 // properties like whether variable sized objects are present.
2749 if (ST.getFrameLowering()->hasFP(MF)) {
2750 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2751 }
2752}
2753
2756 return !Info->isEntryFunction();
2757}
2758
2760
2762 MachineBasicBlock *Entry,
2763 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2765
2766 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2767 if (!IStart)
2768 return;
2769
2770 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2771 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2772 MachineBasicBlock::iterator MBBI = Entry->begin();
2773 for (const MCPhysReg *I = IStart; *I; ++I) {
2774 const TargetRegisterClass *RC = nullptr;
2775 if (AMDGPU::SReg_64RegClass.contains(*I))
2776 RC = &AMDGPU::SGPR_64RegClass;
2777 else if (AMDGPU::SReg_32RegClass.contains(*I))
2778 RC = &AMDGPU::SGPR_32RegClass;
2779 else
2780 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2781
2782 Register NewVR = MRI->createVirtualRegister(RC);
2783 // Create copy from CSR to a virtual register.
2784 Entry->addLiveIn(*I);
2785 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2786 .addReg(*I);
2787
2788 // Insert the copy-back instructions right before the terminator.
2789 for (auto *Exit : Exits)
2790 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2791 TII->get(TargetOpcode::COPY), *I)
2792 .addReg(NewVR);
2793 }
2794}
2795
2797 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2798 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2799 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2801
2803 const Function &Fn = MF.getFunction();
2806
2807 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2808 DiagnosticInfoUnsupported NoGraphicsHSA(
2809 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2810 DAG.getContext()->diagnose(NoGraphicsHSA);
2811 return DAG.getEntryNode();
2812 }
2813
2816 BitVector Skipped(Ins.size());
2817 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2818 *DAG.getContext());
2819
2820 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2821 bool IsKernel = AMDGPU::isKernel(CallConv);
2822 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2823
2824 if (IsGraphics) {
2825 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2826 assert(!UserSGPRInfo.hasDispatchPtr() &&
2827 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2828 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2829 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2830 (void)UserSGPRInfo;
2831 if (!Subtarget->enableFlatScratch())
2832 assert(!UserSGPRInfo.hasFlatScratchInit());
2833 if ((CallConv != CallingConv::AMDGPU_CS &&
2834 CallConv != CallingConv::AMDGPU_Gfx) ||
2835 !Subtarget->hasArchitectedSGPRs())
2836 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2837 !Info->hasWorkGroupIDZ());
2838 }
2839
2840 if (CallConv == CallingConv::AMDGPU_PS) {
2841 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2842
2843 // At least one interpolation mode must be enabled or else the GPU will
2844 // hang.
2845 //
2846 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2847 // set PSInputAddr, the user wants to enable some bits after compilation
2848 // based on run-time states. Since we can't know what the final PSInputEna
2849 // will look like, we shouldn't do anything here; the user should take
2850 // responsibility for the correct programming.
2851 //
2852 // Otherwise, the following restrictions apply:
2853 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2854 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2855 // enabled too.
2856 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2857 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2858 CCInfo.AllocateReg(AMDGPU::VGPR0);
2859 CCInfo.AllocateReg(AMDGPU::VGPR1);
2860 Info->markPSInputAllocated(0);
2861 Info->markPSInputEnabled(0);
2862 }
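 // Enabling input 0 (PERSP_SAMPLE) and reserving VGPR0/VGPR1 for it is
 // enough to satisfy the requirement that at least one interpolation mode
 // is enabled.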
2863 if (Subtarget->isAmdPalOS()) {
2864 // For isAmdPalOS, the user does not enable some bits after compilation
2865 // based on run-time states; the register values being generated here are
2866 // the final ones set in hardware. Therefore we need to apply the
2867 // workaround to PSInputAddr and PSInputEnable together. (The case where
2868 // a bit is set in PSInputAddr but not PSInputEnable is where the
2869 // frontend set up an input arg for a particular interpolation mode, but
2870 // nothing uses that input arg. Really we should have an earlier pass
2871 // that removes such an arg.)
2872 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2873 if ((PsInputBits & 0x7F) == 0 ||
2874 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2875 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2876 }
2877 } else if (IsKernel) {
2878 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2879 } else {
2880 Splits.append(Ins.begin(), Ins.end());
2881 }
2882
2883 if (IsKernel)
2884 analyzeFormalArgumentsCompute(CCInfo, Ins);
2885
2886 if (IsEntryFunc) {
2887 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2888 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2889 if (IsKernel && Subtarget->hasKernargPreload())
2890 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2891
2892 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2893 } else if (!IsGraphics) {
2894 // For the fixed ABI, pass workitem IDs in the last argument register.
2895 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2896
2897 // FIXME: Sink this into allocateSpecialInputSGPRs
2898 if (!Subtarget->enableFlatScratch())
2899 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2900
2901 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2902 }
2903
2904 if (!IsKernel) {
2905 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2906 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2907 }
2908
2910
2911 // FIXME: This is the minimum kernel argument alignment. We should improve
2912 // this to the maximum alignment of the arguments.
2913 //
2914 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2915 // kern arg offset.
2916 const Align KernelArgBaseAlign = Align(16);
2917
2918 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2919 const ISD::InputArg &Arg = Ins[i];
2920 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2921 InVals.push_back(DAG.getUNDEF(Arg.VT));
2922 continue;
2923 }
2924
2925 CCValAssign &VA = ArgLocs[ArgIdx++];
2926 MVT VT = VA.getLocVT();
2927
2928 if (IsEntryFunc && VA.isMemLoc()) {
2929 VT = Ins[i].VT;
2930 EVT MemVT = VA.getLocVT();
2931
2932 const uint64_t Offset = VA.getLocMemOffset();
2933 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2934
2935 if (Arg.Flags.isByRef()) {
2936 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2937
2938 const GCNTargetMachine &TM =
2939 static_cast<const GCNTargetMachine &>(getTargetMachine());
2940 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2941 Arg.Flags.getPointerAddrSpace())) {
2944 }
2945
2946 InVals.push_back(Ptr);
2947 continue;
2948 }
2949
2950 SDValue NewArg;
2951 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2952 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2953 // In this case the argument is packed into the previous preload SGPR.
2954 int64_t AlignDownOffset = alignDown(Offset, 4);
2955 int64_t OffsetDiff = Offset - AlignDownOffset;
2956 EVT IntVT = MemVT.changeTypeToInteger();
2957
2961 Register Reg =
2962 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2963
2964 assert(Reg);
2965 Register VReg = MRI.getLiveInVirtReg(Reg);
2966 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2967
2968 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2969 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2970
2971 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2972 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2973 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2974 Ins[i].Flags.isSExt(), &Ins[i]);
2975
2976 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2977 } else {
2981 const SmallVectorImpl<MCRegister> &PreloadRegs =
2982 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2983
2984 SDValue Copy;
2985 if (PreloadRegs.size() == 1) {
2986 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2987 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2988 NewArg = DAG.getCopyFromReg(
2989 Chain, DL, VReg,
2991 TRI->getRegSizeInBits(*RC)));
2992
2993 } else {
2994 // If the kernarg alignment does not match the alignment of the SGPR
2995 // tuple RC that can accommodate this argument, it will be built up
2996 // via copies from the individual SGPRs that the argument was
2997 // preloaded to.
2999 for (auto Reg : PreloadRegs) {
3000 Register VReg = MRI.getLiveInVirtReg(Reg);
3001 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3002 Elts.push_back(Copy);
3003 }
3004 NewArg =
3005 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3006 PreloadRegs.size()),
3007 DL, Elts);
3008 }
3009
3010 // If the argument was preloaded to multiple consecutive 32-bit
3011 // registers because of misalignment between addressable SGPR tuples
3012 // and the argument size, we can still assume that, because of kernarg
3013 // segment alignment restrictions, NewArg's size is the same as
3014 // MemVT's and just do a bitcast. If MemVT is less than 32 bits, we add a
3015 // truncate, since we cannot preload to less than a single SGPR and
3016 // MemVT may be smaller.
3017 EVT MemVTInt =
3019 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3020 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3021
3022 NewArg = DAG.getBitcast(MemVT, NewArg);
3023 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3024 Ins[i].Flags.isSExt(), &Ins[i]);
3025 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3026 }
3027 } else {
3028 // Hidden arguments that are in the kernel signature must be preloaded
3029 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3030 // the argument list and is not preloaded.
3031 if (Arg.isOrigArg()) {
3032 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3033 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3034 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3035 *OrigArg->getParent(),
3036 "hidden argument in kernel signature was not preloaded",
3037 DL.getDebugLoc());
3038 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3039 }
3040 }
3041
3042 NewArg =
3043 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3044 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3045 }
3046 Chains.push_back(NewArg.getValue(1));
3047
3048 auto *ParamTy =
3049 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3051 ParamTy &&
3052 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3053 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3054 // On SI local pointers are just offsets into LDS, so they are always
3055 // less than 16-bits. On CI and newer they could potentially be
3056 // real pointers, so we can't guarantee their size.
3057 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3058 DAG.getValueType(MVT::i16));
3059 }
3060
3061 InVals.push_back(NewArg);
3062 continue;
3063 }
3064 if (!IsEntryFunc && VA.isMemLoc()) {
3065 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3066 InVals.push_back(Val);
3067 if (!Arg.Flags.isByVal())
3068 Chains.push_back(Val.getValue(1));
3069 continue;
3070 }
3071
3072 assert(VA.isRegLoc() && "Parameter must be in a register!");
3073
3074 Register Reg = VA.getLocReg();
3075 const TargetRegisterClass *RC = nullptr;
3076 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3077 RC = &AMDGPU::VGPR_32RegClass;
3078 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3079 RC = &AMDGPU::SGPR_32RegClass;
3080 else
3081 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3082 EVT ValVT = VA.getValVT();
3083
3084 Reg = MF.addLiveIn(Reg, RC);
3085 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3086
3087 if (Arg.Flags.isSRet()) {
3088 // The return object should be reasonably addressable.
3089
3090 // FIXME: This helps when the return is a real sret. If it is an
3091 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3092 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3093 unsigned NumBits =
3095 Val = DAG.getNode(
3096 ISD::AssertZext, DL, VT, Val,
3097 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3098 }
3099
3100 // If this is an 8 or 16-bit value, it is really passed promoted
3101 // to 32 bits. Insert an assert[sz]ext to capture this, then
3102 // truncate to the right size.
3103 switch (VA.getLocInfo()) {
3104 case CCValAssign::Full:
3105 break;
3106 case CCValAssign::BCvt:
3107 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3108 break;
3109 case CCValAssign::SExt:
3110 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3111 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3112 break;
3113 case CCValAssign::ZExt:
3114 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3115 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3116 break;
3117 case CCValAssign::AExt:
3118 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3119 break;
3120 default:
3121 llvm_unreachable("Unknown loc info!");
3122 }
3123
3124 InVals.push_back(Val);
3125 }
3126
3127 // Start adding system SGPRs.
3128 if (IsEntryFunc)
3129 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3130
3131 // DAG.getPass() returns nullptr when using new pass manager.
3132 // TODO: Use DAG.getMFAM() to access analysis result.
3133 if (DAG.getPass()) {
3134 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3135 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3136 }
3137
3138 unsigned StackArgSize = CCInfo.getStackSize();
3139 Info->setBytesInStackArgArea(StackArgSize);
3140
3141 return Chains.empty() ? Chain
3142 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3143}
3144
3145// TODO: If return values can't fit in registers, we should return as many as
3146// possible in registers before passing on stack.
3148 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3149 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3150 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3151 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3152 // for shaders. Vector types should be explicitly handled by CC.
3153 if (AMDGPU::isEntryFunctionCC(CallConv))
3154 return true;
3155
3157 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3158 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3159 return false;
3160
3161 // We must use the stack if return would require unavailable registers.
3162 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3163 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3164 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3165 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3166 return false;
3167
3168 return true;
3169}
3170
3171SDValue
3173 bool isVarArg,
3175 const SmallVectorImpl<SDValue> &OutVals,
3176 const SDLoc &DL, SelectionDAG &DAG) const {
3179
3180 if (AMDGPU::isKernel(CallConv)) {
3181 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3182 OutVals, DL, DAG);
3183 }
3184
3185 bool IsShader = AMDGPU::isShader(CallConv);
3186
3187 Info->setIfReturnsVoid(Outs.empty());
3188 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3189
3190 // CCValAssign - represent the assignment of the return value to a location.
3193
3194 // CCState - Info about the registers and stack slots.
3195 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3196 *DAG.getContext());
3197
3198 // Analyze outgoing return values.
3199 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3200
3201 SDValue Glue;
3203 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3204
3205 // Copy the result values into the output registers.
3206 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3207 ++I, ++RealRVLocIdx) {
3208 CCValAssign &VA = RVLocs[I];
3209 assert(VA.isRegLoc() && "Can only return in registers!");
3210 // TODO: Partially return in registers if return values don't fit.
3211 SDValue Arg = OutVals[RealRVLocIdx];
3212
3213 // Copied from other backends.
3214 switch (VA.getLocInfo()) {
3215 case CCValAssign::Full:
3216 break;
3217 case CCValAssign::BCvt:
3218 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3219 break;
3220 case CCValAssign::SExt:
3221 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3222 break;
3223 case CCValAssign::ZExt:
3224 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3225 break;
3226 case CCValAssign::AExt:
3227 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3228 break;
3229 default:
3230 llvm_unreachable("Unknown loc info!");
3231 }
3232
3233 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3234 Glue = Chain.getValue(1);
3235 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3236 }
3237
3238 // FIXME: Does sret work properly?
3239 if (!Info->isEntryFunction()) {
3240 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3241 const MCPhysReg *I =
3242 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3243 if (I) {
3244 for (; *I; ++I) {
3245 if (AMDGPU::SReg_64RegClass.contains(*I))
3246 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3247 else if (AMDGPU::SReg_32RegClass.contains(*I))
3248 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3249 else
3250 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3251 }
3252 }
3253 }
3254
3255 // Update chain and glue.
3256 RetOps[0] = Chain;
3257 if (Glue.getNode())
3258 RetOps.push_back(Glue);
3259
3260 unsigned Opc = AMDGPUISD::ENDPGM;
3261 if (!IsWaveEnd)
3262 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3263 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3264}
3265
3267 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3268 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3269 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3270 SDValue ThisVal) const {
3271 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3272
3273 // Assign locations to each value returned by this call.
3275 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3276 *DAG.getContext());
3277 CCInfo.AnalyzeCallResult(Ins, RetCC);
3278
3279 // Copy all of the result registers out of their specified physreg.
3280 for (CCValAssign VA : RVLocs) {
3281 SDValue Val;
3282
3283 if (VA.isRegLoc()) {
3284 Val =
3285 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3286 Chain = Val.getValue(1);
3287 InGlue = Val.getValue(2);
3288 } else if (VA.isMemLoc()) {
3289 report_fatal_error("TODO: return values in memory");
3290 } else
3291 llvm_unreachable("unknown argument location type");
3292
3293 switch (VA.getLocInfo()) {
3294 case CCValAssign::Full:
3295 break;
3296 case CCValAssign::BCvt:
3297 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3298 break;
3299 case CCValAssign::ZExt:
3300 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3301 DAG.getValueType(VA.getValVT()));
3302 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3303 break;
3304 case CCValAssign::SExt:
3305 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3306 DAG.getValueType(VA.getValVT()));
3307 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3308 break;
3309 case CCValAssign::AExt:
3310 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3311 break;
3312 default:
3313 llvm_unreachable("Unknown loc info!");
3314 }
3315
3316 InVals.push_back(Val);
3317 }
3318
3319 return Chain;
3320}
3321
3322// Add code to pass the special inputs required by the features in use,
3323// separate from the explicit user arguments present in the IR.
3325 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3326 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3327 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3328 // If we don't have a call site, this was a call inserted by
3329 // legalization. These can never use special inputs.
3330 if (!CLI.CB)
3331 return;
3332
3333 SelectionDAG &DAG = CLI.DAG;
3334 const SDLoc &DL = CLI.DL;
3335 const Function &F = DAG.getMachineFunction().getFunction();
3336
3337 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3338 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3339
3340 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3342 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3343 // DAG.getPass() returns nullptr when using new pass manager.
3344 // TODO: Use DAG.getMFAM() to access analysis result.
3345 if (DAG.getPass()) {
3346 auto &ArgUsageInfo =
3348 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3349 }
3350 }
3351
3352 // TODO: Unify with private memory register handling. This is complicated by
3353 // the fact that at least in kernels, the input argument is not necessarily
3354 // in the same location as the input.
3355 // clang-format off
3356 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3358 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3359 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3360 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3361 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3362 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3363 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3364 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3365 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3366 };
3367 // clang-format on
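 // Each table entry pairs a preloaded input with the amdgpu-no-* attribute
 // that marks it as unused; the loop below only forwards inputs whose
 // attribute is absent from the call site.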
3368
3369 for (auto [InputID, Attr] : ImplicitAttrs) {
3370 // If the callee does not use the attribute value, skip copying the value.
3371 if (CLI.CB->hasFnAttr(Attr))
3372 continue;
3373
3374 const auto [OutgoingArg, ArgRC, ArgTy] =
3375 CalleeArgInfo->getPreloadedValue(InputID);
3376 if (!OutgoingArg)
3377 continue;
3378
3379 const auto [IncomingArg, IncomingArgRC, Ty] =
3380 CallerArgInfo.getPreloadedValue(InputID);
3381 assert(IncomingArgRC == ArgRC);
3382
3383 // All special arguments are ints for now.
3384 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3385 SDValue InputReg;
3386
3387 if (IncomingArg) {
3388 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3389 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3390 // The implicit arg ptr is special because it doesn't have a corresponding
3391 // input for kernels, and is computed from the kernarg segment pointer.
3392 InputReg = getImplicitArgPtr(DAG, DL);
3393 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3394 std::optional<uint32_t> Id =
3396 if (Id.has_value()) {
3397 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3398 } else {
3399 InputReg = DAG.getUNDEF(ArgVT);
3400 }
3401 } else {
3402 // We may have proven the input wasn't needed, although the ABI is
3403 // requiring it. We just need to allocate the register appropriately.
3404 InputReg = DAG.getUNDEF(ArgVT);
3405 }
3406
3407 if (OutgoingArg->isRegister()) {
3408 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3409 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3410 report_fatal_error("failed to allocate implicit input argument");
3411 } else {
3412 unsigned SpecialArgOffset =
3413 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3414 SDValue ArgStore =
3415 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3416 MemOpChains.push_back(ArgStore);
3417 }
3418 }
3419
3420 // Pack the workitem IDs into a single register, or pass them as-is if
3421 // already packed.
3422
3423 auto [OutgoingArg, ArgRC, Ty] =
3425 if (!OutgoingArg)
3426 std::tie(OutgoingArg, ArgRC, Ty) =
3428 if (!OutgoingArg)
3429 std::tie(OutgoingArg, ArgRC, Ty) =
3431 if (!OutgoingArg)
3432 return;
3433
3434 const ArgDescriptor *IncomingArgX = std::get<0>(
3436 const ArgDescriptor *IncomingArgY = std::get<0>(
3438 const ArgDescriptor *IncomingArgZ = std::get<0>(
3440
3441 SDValue InputReg;
3442 SDLoc SL;
3443
3444 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3445 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3446 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3447
3448 // If incoming ids are not packed we need to pack them.
3449 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3450 NeedWorkItemIDX) {
3451 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3452 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3453 } else {
3454 InputReg = DAG.getConstant(0, DL, MVT::i32);
3455 }
3456 }
3457
3458 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3459 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3460 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3461 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3462 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3463 InputReg = InputReg.getNode()
3464 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3465 : Y;
3466 }
3467
3468 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3469 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3470 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3471 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3472 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3473 InputReg = InputReg.getNode()
3474 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3475 : Z;
3476 }
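 // At this point InputReg (if set) carries the workitem IDs in the packed
 // layout the callee expects: X in bits [9:0], Y in [19:10] and Z in [29:20].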
3477
3478 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3479 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3480 // We're in a situation where the outgoing function requires the workitem
3481 // ID, but the calling function does not have it (e.g. a graphics function
3482 // calling a C calling convention function). This is illegal, but we need
3483 // to produce something.
3484 InputReg = DAG.getUNDEF(MVT::i32);
3485 } else {
3486 // Workitem IDs are already packed; any of the present incoming arguments
3487 // will carry all required fields.
3488 ArgDescriptor IncomingArg =
3489 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3490 : IncomingArgY ? *IncomingArgY
3491 : *IncomingArgZ,
3492 ~0u);
3493 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3494 }
3495 }
3496
3497 if (OutgoingArg->isRegister()) {
3498 if (InputReg)
3499 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3500
3501 CCInfo.AllocateReg(OutgoingArg->getRegister());
3502 } else {
3503 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3504 if (InputReg) {
3505 SDValue ArgStore =
3506 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3507 MemOpChains.push_back(ArgStore);
3508 }
3509 }
3510}
3511
3513 return CC == CallingConv::Fast;
3514}
3515
3516/// Return true if we might ever do TCO for calls with this calling convention.
3518 switch (CC) {
3519 case CallingConv::C:
3521 return true;
3522 default:
3523 return canGuaranteeTCO(CC);
3524 }
3525}
3526
3528 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3530 const SmallVectorImpl<SDValue> &OutVals,
3531 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3532 if (AMDGPU::isChainCC(CalleeCC))
3533 return true;
3534
3535 if (!mayTailCallThisCC(CalleeCC))
3536 return false;
3537
3538 // For a divergent call target, we need to do a waterfall loop over the
3539 // possible callees which precludes us from using a simple jump.
3540 if (Callee->isDivergent())
3541 return false;
3542
3544 const Function &CallerF = MF.getFunction();
3545 CallingConv::ID CallerCC = CallerF.getCallingConv();
3547 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3548
3549 // Kernels aren't callable, and don't have a live in return address so it
3550 // doesn't make sense to do a tail call with entry functions.
3551 if (!CallerPreserved)
3552 return false;
3553
3554 bool CCMatch = CallerCC == CalleeCC;
3555
3557 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3558 return true;
3559 return false;
3560 }
3561
3562 // TODO: Can we handle var args?
3563 if (IsVarArg)
3564 return false;
3565
3566 for (const Argument &Arg : CallerF.args()) {
3567 if (Arg.hasByValAttr())
3568 return false;
3569 }
3570
3571 LLVMContext &Ctx = *DAG.getContext();
3572
3573 // Check that the call results are passed in the same way.
3574 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3575 CCAssignFnForCall(CalleeCC, IsVarArg),
3576 CCAssignFnForCall(CallerCC, IsVarArg)))
3577 return false;
3578
3579 // The callee has to preserve all registers the caller needs to preserve.
3580 if (!CCMatch) {
3581 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3582 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3583 return false;
3584 }
3585
3586 // Nothing more to check if the callee is taking no arguments.
3587 if (Outs.empty())
3588 return true;
3589
3591 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3592
3593 // FIXME: We are not allocating special input registers, so we will be
3594 // deciding based on incorrect register assignments.
3595 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3596
3597 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3598 // If the stack arguments for this call do not fit into our own save area,
3599 // then the call cannot be made a tail call.
3600 // TODO: Is this really necessary?
3601 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3602 return false;
3603
3604 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3605 // FIXME: What about inreg arguments that end up passed in memory?
3606 if (!CCVA.isRegLoc())
3607 continue;
3608
3609 // If we are passing an argument in an SGPR, and the value is divergent,
3610 // this call requires a waterfall loop.
3611 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3612 LLVM_DEBUG(
3613 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3614 << printReg(CCVA.getLocReg(), TRI) << '\n');
3615 return false;
3616 }
3617 }
3618
3619 const MachineRegisterInfo &MRI = MF.getRegInfo();
3620 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3621}
3622
3624 if (!CI->isTailCall())
3625 return false;
3626
3627 const Function *ParentFn = CI->getParent()->getParent();
3629 return false;
3630 return true;
3631}
3632
3633// The wave scratch offset register is used as the global base pointer.
3635 SmallVectorImpl<SDValue> &InVals) const {
3636 CallingConv::ID CallConv = CLI.CallConv;
3637 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3638
3639 SelectionDAG &DAG = CLI.DAG;
3640
3641 TargetLowering::ArgListEntry RequestedExec;
3642 if (IsChainCallConv) {
3643 // The last argument should be the value that we need to put in EXEC.
3644 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3645 // don't treat it like the rest of the arguments.
3646 RequestedExec = CLI.Args.back();
3647 assert(RequestedExec.Node && "No node for EXEC");
3648
3649 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3650 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3651
3652 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3653 CLI.Outs.pop_back();
3654 CLI.OutVals.pop_back();
3655
3656 if (RequestedExec.Ty->isIntegerTy(64)) {
3657 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3658 CLI.Outs.pop_back();
3659 CLI.OutVals.pop_back();
3660 }
3661
3662 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3663 "Haven't popped all the pieces of the EXEC mask");
3664 }
3665
3666 const SDLoc &DL = CLI.DL;
3668 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3670 SDValue Chain = CLI.Chain;
3671 SDValue Callee = CLI.Callee;
3672 bool &IsTailCall = CLI.IsTailCall;
3673 bool IsVarArg = CLI.IsVarArg;
3674 bool IsSibCall = false;
3676
3677 if (Callee.isUndef() || isNullConstant(Callee)) {
3678 if (!CLI.IsTailCall) {
3679 for (ISD::InputArg &Arg : CLI.Ins)
3680 InVals.push_back(DAG.getUNDEF(Arg.VT));
3681 }
3682
3683 return Chain;
3684 }
3685
3686 if (IsVarArg) {
3687 return lowerUnhandledCall(CLI, InVals,
3688 "unsupported call to variadic function ");
3689 }
3690
3691 if (!CLI.CB)
3692 report_fatal_error("unsupported libcall legalization");
3693
3694 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3695 return lowerUnhandledCall(CLI, InVals,
3696 "unsupported required tail call to function ");
3697 }
3698
3699 if (IsTailCall) {
3700 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3701 Outs, OutVals, Ins, DAG);
3702 if (!IsTailCall &&
3703 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3704 report_fatal_error("failed to perform tail call elimination on a call "
3705 "site marked musttail or on llvm.amdgcn.cs.chain");
3706 }
3707
3708 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3709
3710 // A sibling call is one where we're under the usual C ABI and not planning
3711 // to change that but can still do a tail call:
3712 if (!TailCallOpt && IsTailCall)
3713 IsSibCall = true;
3714
3715 if (IsTailCall)
3716 ++NumTailCalls;
3717 }
3718
3721 SmallVector<SDValue, 8> MemOpChains;
3722
3723 // Analyze operands of the call, assigning locations to each operand.
3725 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3726 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3727
3728 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3729 // With a fixed ABI, allocate fixed registers before user arguments.
3730 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3731 }
3732
3733 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3734
3735 // Get a count of how many bytes are to be pushed on the stack.
3736 unsigned NumBytes = CCInfo.getStackSize();
3737
3738 if (IsSibCall) {
3739 // Since we're not changing the ABI to make this a tail call, the memory
3740 // operands are already available in the caller's incoming argument space.
3741 NumBytes = 0;
3742 }
3743
3744 // FPDiff is the byte offset of the call's argument area from the callee's.
3745 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3746 // by this amount for a tail call. In a sibling call it must be 0 because the
3747 // caller will deallocate the entire stack and the callee still expects its
3748 // arguments to begin at SP+0. Completely unused for non-tail calls.
3749 int32_t FPDiff = 0;
3750 MachineFrameInfo &MFI = MF.getFrameInfo();
3751 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3752
3753 // Adjust the stack pointer for the new arguments...
3754 // These operations are automatically eliminated by the prolog/epilog pass
3755 if (!IsSibCall)
3756 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3757
3758 if (!IsSibCall || IsChainCallConv) {
3759 if (!Subtarget->enableFlatScratch()) {
3760 SmallVector<SDValue, 4> CopyFromChains;
3761
3762 // In the HSA case, this should be an identity copy.
3763 SDValue ScratchRSrcReg =
3764 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3765 RegsToPass.emplace_back(IsChainCallConv
3766 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3767 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3768 ScratchRSrcReg);
3769 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3770 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3771 }
3772 }
3773
3774 const unsigned NumSpecialInputs = RegsToPass.size();
3775
3776 MVT PtrVT = MVT::i32;
3777
3778 // Walk the register/memloc assignments, inserting copies/loads.
3779 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3780 CCValAssign &VA = ArgLocs[i];
3781 SDValue Arg = OutVals[i];
3782
3783 // Promote the value if needed.
3784 switch (VA.getLocInfo()) {
3785 case CCValAssign::Full:
3786 break;
3787 case CCValAssign::BCvt:
3788 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3789 break;
3790 case CCValAssign::ZExt:
3791 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3792 break;
3793 case CCValAssign::SExt:
3794 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3795 break;
3796 case CCValAssign::AExt:
3797 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3798 break;
3799 case CCValAssign::FPExt:
3800 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3801 break;
3802 default:
3803 llvm_unreachable("Unknown loc info!");
3804 }
3805
3806 if (VA.isRegLoc()) {
3807 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3808 } else {
3809 assert(VA.isMemLoc());
3810
3811 SDValue DstAddr;
3812 MachinePointerInfo DstInfo;
3813
3814 unsigned LocMemOffset = VA.getLocMemOffset();
3815 int32_t Offset = LocMemOffset;
3816
3817 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3818 MaybeAlign Alignment;
3819
3820 if (IsTailCall) {
3821 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3822 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3823 : VA.getValVT().getStoreSize();
3824
3825 // FIXME: We can have better than the minimum byval required alignment.
3826 Alignment =
3827 Flags.isByVal()
3828 ? Flags.getNonZeroByValAlign()
3829 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3830
3831 Offset = Offset + FPDiff;
3832 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3833
3834 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3835 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3836
3837 // Make sure any stack arguments overlapping with where we're storing
3838 // are loaded before this eventual operation. Otherwise they'll be
3839 // clobbered.
3840
3841 // FIXME: Why is this really necessary? This seems to just result in a
3842 // lot of code to copy the stack and write them back to the same
3843 // locations, which are supposed to be immutable?
3844 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3845 } else {
3846 // Stores to the argument stack area are relative to the stack pointer.
3847 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3848 MVT::i32);
3849 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3850 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3851 Alignment =
3852 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3853 }
3854
3855 if (Outs[i].Flags.isByVal()) {
3856 SDValue SizeNode =
3857 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3858 SDValue Cpy =
3859 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3860 Outs[i].Flags.getNonZeroByValAlign(),
3861 /*isVol = */ false, /*AlwaysInline = */ true,
3862 /*CI=*/nullptr, std::nullopt, DstInfo,
3864
3865 MemOpChains.push_back(Cpy);
3866 } else {
3867 SDValue Store =
3868 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3869 MemOpChains.push_back(Store);
3870 }
3871 }
3872 }
3873
3874 if (!MemOpChains.empty())
3875 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3876
3877 SDValue ReadFirstLaneID =
3878 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3879
3880 SDValue TokenGlue;
3881 if (CLI.ConvergenceControlToken) {
3882 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3884 }
3885
3886 // Build a sequence of copy-to-reg nodes chained together with token chain
3887 // and flag operands which copy the outgoing args into the appropriate regs.
3888 SDValue InGlue;
3889
3890 unsigned ArgIdx = 0;
3891 for (auto [Reg, Val] : RegsToPass) {
3892 if (ArgIdx++ >= NumSpecialInputs &&
3893 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3894 // For chain calls, the inreg arguments are required to be
3895 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3896 // they are uniform.
3897 //
3898 // For other calls, if an inreg argument is known to be uniform,
3899 // speculatively insert a readfirstlane in case it is in a VGPR.
3900 //
3901 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3902 // value; for now, that case will continue to produce invalid code.
3903
3904 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3905 if (TokenGlue)
3906 ReadfirstlaneArgs.push_back(TokenGlue);
3908 ReadfirstlaneArgs);
3909 }
3910
3911 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3912 InGlue = Chain.getValue(1);
3913 }
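// Note on the readfirstlane insertion above (explanatory, not from the
// original source): v_readfirstlane_b32 copies the value held by the first
// active lane of a VGPR into an SGPR, so its result is wave-uniform by
// construction. Inserting it is harmless when the value was already uniform
// (every active lane holds the same value) and guarantees the physical SGPR
// argument is written from a scalar register.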
3914
3915 // We don't usually want to end the call-sequence here because we would tidy
3916 // the frame up *after* the call, however in the ABI-changing tail-call case
3917 // we've carefully laid out the parameters so that when sp is reset they'll be
3918 // in the correct location.
3919 if (IsTailCall && !IsSibCall) {
3920 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3921 InGlue = Chain.getValue(1);
3922 }
3923
3924 std::vector<SDValue> Ops({Chain});
3925
3926 // Add a redundant copy of the callee global which will not be legalized, as
3927 // we need direct access to the callee later.
3928 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3929 const GlobalValue *GV = GSD->getGlobal();
3930 Ops.push_back(Callee);
3931 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3932 } else {
3933 if (IsTailCall) {
3934 // isEligibleForTailCallOptimization considered whether the call target is
3935 // divergent, but we may still end up with a uniform value in a VGPR.
3936 // Insert a readfirstlane just in case.
3937 SDValue ReadFirstLaneID =
3938 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3939
3940 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3941 if (TokenGlue)
3942 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3943 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3944 ReadfirstlaneArgs);
3945 }
3946
3947 Ops.push_back(Callee);
3948 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3949 }
3950
3951 if (IsTailCall) {
3952 // Each tail call may have to adjust the stack by a different amount, so
3953 // this information must travel along with the operation for eventual
3954 // consumption by emitEpilogue.
3955 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3956 }
3957
3958 if (IsChainCallConv)
3959 Ops.push_back(RequestedExec.Node);
3960
3961 // Add argument registers to the end of the list so that they are known live
3962 // into the call.
3963 for (auto &[Reg, Val] : RegsToPass)
3964 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3965
3966 // Add a register mask operand representing the call-preserved registers.
3967 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3968 assert(Mask && "Missing call preserved mask for calling convention");
3969 Ops.push_back(DAG.getRegisterMask(Mask));
3970
3971 if (SDValue Token = CLI.ConvergenceControlToken) {
3973 GlueOps.push_back(Token);
3974 if (InGlue)
3975 GlueOps.push_back(InGlue);
3976
3977 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3978 MVT::Glue, GlueOps),
3979 0);
3980 }
3981
3982 if (InGlue)
3983 Ops.push_back(InGlue);
3984
3985 // If we're doing a tail call, use a TC_RETURN here rather than an
3986 // actual call instruction.
3987 if (IsTailCall) {
3988 MFI.setHasTailCall();
3989 unsigned OPC = AMDGPUISD::TC_RETURN;
3990 switch (CallConv) {
3993 break;
3997 break;
3998 }
3999
4000 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4001 }
4002
4003 // Returns a chain and a flag for retval copy to use.
4004 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4005 Chain = Call.getValue(0);
4006 InGlue = Call.getValue(1);
4007
4008 uint64_t CalleePopBytes = NumBytes;
4009 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4010 if (!Ins.empty())
4011 InGlue = Chain.getValue(1);
4012
4013 // Handle result values, copying them out of physregs into vregs that we
4014 // return.
4015 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4016 InVals, /*IsThisReturn=*/false, SDValue());
4017}
4018
4019// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020// except for:
4021 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4022 // 2. Scaled size, where scaled-size = wave-reduction(alloca-size) * wave-size
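// Worked example (illustrative only): on a wave64 subtarget, a per-lane
// alloca of 16 bytes advances SP by 16 << 6 = 1024 bytes, because the
// scratch (private) memory backing the stack is swizzled and SP tracks the
// combined allocation of all lanes in the wave.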
4024 SelectionDAG &DAG) const {
4025 const MachineFunction &MF = DAG.getMachineFunction();
4027
4028 SDLoc dl(Op);
4029 EVT VT = Op.getValueType();
4030 SDValue Chain = Op.getOperand(0);
4031 Register SPReg = Info->getStackPtrOffsetReg();
4032
4033 // Chain the dynamic stack allocation so that it doesn't modify the stack
4034 // pointer when other instructions are using the stack.
4035 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4036
4037 SDValue Size = Op.getOperand(1);
4038 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4039 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4040
4041 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4043 "Stack grows upwards for AMDGPU");
4044
4045 Chain = BaseAddr.getValue(1);
4046 Align StackAlign = TFL->getStackAlign();
4047 if (Alignment > StackAlign) {
4048 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4049 << Subtarget->getWavefrontSizeLog2();
4050 uint64_t StackAlignMask = ScaledAlignment - 1;
4051 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4052 DAG.getConstant(StackAlignMask, dl, VT));
4053 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4054 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4055 }
4056
4057 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4058 SDValue NewSP;
4059 if (isa<ConstantSDNode>(Size)) {
4060 // For a constant-sized alloca, scale the alloca size by the wave size.
4061 SDValue ScaledSize = DAG.getNode(
4062 ISD::SHL, dl, VT, Size,
4063 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4064 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4065 } else {
4066 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4067 // max of the (divergent) alloca size, then scale it by the wave size.
4068 SDValue WaveReduction =
4069 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4070 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4071 Size, DAG.getConstant(0, dl, MVT::i32));
4072 SDValue ScaledSize = DAG.getNode(
4073 ISD::SHL, dl, VT, Size,
4074 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4075 NewSP =
4076 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4077 SDValue ReadFirstLaneID =
4078 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4079 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4080 NewSP);
4081 }
4082
4083 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4084 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4085
4086 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4087}
4088
4090 if (Op.getValueType() != MVT::i32)
4091 return Op; // Defer to cannot select error.
4092
4094 SDLoc SL(Op);
4095
4096 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4097
4098 // Convert from wave uniform to swizzled vector address. This should protect
4099 // from any edge cases where the stacksave result isn't directly used with
4100 // stackrestore.
4101 SDValue VectorAddress =
4102 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4103 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4104}
4105
4107 SelectionDAG &DAG) const {
4108 SDLoc SL(Op);
4109 assert(Op.getValueType() == MVT::i32);
4110
4111 uint32_t BothRoundHwReg =
4113 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4114
4115 SDValue IntrinID =
4116 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4117 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4118 Op.getOperand(0), IntrinID, GetRoundBothImm);
4119
4120 // There are two rounding modes, one for f32 and one for f64/f16. We only
4121 // report in the standard value range if both are the same.
4122 //
4123 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4124 // ties away from zero is not supported, and the other values are rotated by
4125 // 1.
4126 //
4127 // If the two rounding modes are not the same, report a target defined value.
4128
4129 // Mode register rounding mode fields:
4130 //
4131 // [1:0] Single-precision round mode.
4132 // [3:2] Double/Half-precision round mode.
4133 //
4134 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4135 //
4136 //                  Hardware   Spec
4137 //  Toward-0        3          0
4138 //  Nearest Even    0          1
4139 //  +Inf            1          2
4140 //  -Inf            2          3
4141 //  NearestAway0    N/A        4
4142 //
4143 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4144 // table we can index by the raw hardware mode.
4145 //
4146 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
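// Worked example (illustrative only): if both fields are round-to-nearest-
// even, the raw mode is 0b0000 and the low 4-bit table entry is the spec
// value 1 (to nearest). Entries below 4 are returned directly as standard
// FLT_ROUNDS values; entries of 4 and above are target-defined and are
// offset by 4 below, so the extended results start at 8.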
4147
4148 SDValue BitTable =
4150
4151 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4152 SDValue RoundModeTimesNumBits =
4153 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4154
4155 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4156 // knew only one mode was demanded.
4157 SDValue TableValue =
4158 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4159 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4160
4161 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4162 SDValue TableEntry =
4163 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4164
4165 // There's a gap between the 4-bit encoded table values and the actual enum
4166 // values, so offset the result if it's an extended value.
4167 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4168 SDValue IsStandardValue =
4169 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4170 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4171 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4172 TableEntry, EnumOffset);
4173
4174 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4175}
4176
4178 SelectionDAG &DAG) const {
4179 SDLoc SL(Op);
4180
4181 SDValue NewMode = Op.getOperand(1);
4182 assert(NewMode.getValueType() == MVT::i32);
4183
4184 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4185 // hardware MODE.fp_round values.
4186 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4187 uint32_t ClampedVal = std::min(
4188 static_cast<uint32_t>(ConstMode->getZExtValue()),
4190 NewMode = DAG.getConstant(
4191 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4192 } else {
4193 // If we know the input can only be one of the supported standard modes in
4194 // the range 0-3, we can use a simplified mapping to hardware values.
4195 KnownBits KB = DAG.computeKnownBits(NewMode);
4196 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4197 // The supported standard values are 0-3. The extended values start at 8. We
4198 // need to offset by 4 if the value is in the extended range.
4199
4200 if (UseReducedTable) {
4201 // Truncate to the low 32-bits.
4202 SDValue BitTable = DAG.getConstant(
4203 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4204
4205 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4206 SDValue RoundModeTimesNumBits =
4207 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4208
4209 NewMode =
4210 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4211
4212 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4213 // the table extracted bits into inline immediates.
4214 } else {
4215 // table_index = umin(value, value - 4)
4216 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
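// Worked arithmetic for the umin trick above (illustrative only): for a
// standard value such as 2, (2 - 4) wraps to 0xfffffffe, so
// umin(2, 0xfffffffe) = 2 and the index is unchanged; for an extended value
// such as 8, umin(8, 4) = 4. Standard values therefore occupy table indices
// 0-3 and extended values follow from index 4 onward, keeping the table dense.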
4217 SDValue BitTable =
4219
4220 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4221 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4222 SDValue IndexVal =
4223 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4224
4225 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4226 SDValue RoundModeTimesNumBits =
4227 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4228
4229 SDValue TableValue =
4230 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4231 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4232
4233 // No need to mask out the high bits since the setreg will ignore them
4234 // anyway.
4235 NewMode = TruncTable;
4236 }
4237
4238 // Insert a readfirstlane in case the value is a VGPR. We could do this
4239 // earlier and keep more operations scalar, but that interferes with
4240 // combining the source.
4241 SDValue ReadFirstLaneID =
4242 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4243 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4244 ReadFirstLaneID, NewMode);
4245 }
4246
4247 // N.B. The setreg will be later folded into s_round_mode on supported
4248 // targets.
4249 SDValue IntrinID =
4250 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4251 uint32_t BothRoundHwReg =
4253 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4254
4255 SDValue SetReg =
4256 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4257 IntrinID, RoundBothImm, NewMode);
4258
4259 return SetReg;
4260}
4261
4263 if (Op->isDivergent())
4264 return SDValue();
4265
4266 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4271 break;
4272 default:
4273 return SDValue();
4274 }
4275
4276 return Op;
4277}
4278
4279// Work around DAG legality rules only based on the result type.
4281 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4282 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4283 EVT SrcVT = Src.getValueType();
4284
4285 if (SrcVT.getScalarType() != MVT::bf16)
4286 return Op;
4287
4288 SDLoc SL(Op);
4289 SDValue BitCast =
4290 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4291
4292 EVT DstVT = Op.getValueType();
4293 if (IsStrict)
4294 llvm_unreachable("Need STRICT_BF16_TO_FP");
4295
4296 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4297}
4298
4300 SDLoc SL(Op);
4301 if (Op.getValueType() != MVT::i64)
4302 return Op;
4303
4304 uint32_t ModeHwReg =
4306 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4307 uint32_t TrapHwReg =
4309 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4310
4311 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4312 SDValue IntrinID =
4313 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4314 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4315 Op.getOperand(0), IntrinID, ModeHwRegImm);
4316 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4317 Op.getOperand(0), IntrinID, TrapHwRegImm);
4318 SDValue TokenReg =
4319 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4320 GetTrapReg.getValue(1));
4321
4322 SDValue CvtPtr =
4323 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4324 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4325
4326 return DAG.getMergeValues({Result, TokenReg}, SL);
4327}
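// Layout note (explanatory, not from the original source): with the
// little-endian element order assumed here, the v2i32 built above bitcasts
// to an i64 whose low 32 bits hold the MODE register value and whose high 32
// bits hold the trap-status register value; the SET_FPENV lowering below
// unpacks the value with the same element order.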
4328
4330 SDLoc SL(Op);
4331 if (Op.getOperand(1).getValueType() != MVT::i64)
4332 return Op;
4333
4334 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4335 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4336 DAG.getConstant(0, SL, MVT::i32));
4337 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4338 DAG.getConstant(1, SL, MVT::i32));
4339
4340 SDValue ReadFirstLaneID =
4341 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4342 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4343 ReadFirstLaneID, NewModeReg);
4344 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4345 ReadFirstLaneID, NewTrapReg);
4346
4347 unsigned ModeHwReg =
4349 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4350 unsigned TrapHwReg =
4352 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4353
4354 SDValue IntrinID =
4355 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4356 SDValue SetModeReg =
4357 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4358 IntrinID, ModeHwRegImm, NewModeReg);
4359 SDValue SetTrapReg =
4360 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4361 IntrinID, TrapHwRegImm, NewTrapReg);
4362 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4363}
4364
4366 const MachineFunction &MF) const {
4368 .Case("m0", AMDGPU::M0)
4369 .Case("exec", AMDGPU::EXEC)
4370 .Case("exec_lo", AMDGPU::EXEC_LO)
4371 .Case("exec_hi", AMDGPU::EXEC_HI)
4372 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4373 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4374 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4375 .Default(Register());
4376
4377 if (Reg == AMDGPU::NoRegister) {
4379 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4380 }
4381
4382 if (!Subtarget->hasFlatScrRegister() &&
4383 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4384 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4385 "\" for subtarget."));
4386 }
4387
4388 switch (Reg) {
4389 case AMDGPU::M0:
4390 case AMDGPU::EXEC_LO:
4391 case AMDGPU::EXEC_HI:
4392 case AMDGPU::FLAT_SCR_LO:
4393 case AMDGPU::FLAT_SCR_HI:
4394 if (VT.getSizeInBits() == 32)
4395 return Reg;
4396 break;
4397 case AMDGPU::EXEC:
4398 case AMDGPU::FLAT_SCR:
4399 if (VT.getSizeInBits() == 64)
4400 return Reg;
4401 break;
4402 default:
4403 llvm_unreachable("missing register type checking");
4404 }
4405
4407 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4408}
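// Usage note (illustrative, not from the original source): this hook backs
// the llvm.read_register / llvm.write_register intrinsics, e.g.
//
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
//
// Only the register names accepted above are valid, and the requested type
// must match the register width checked in the switch.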
4409
4410// If kill is not the last instruction, split the block so kill is always a
4411// proper terminator.
4414 MachineBasicBlock *BB) const {
4415 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4417 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4418 return SplitBB;
4419}
4420
4421// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4422// \p MI will be the only instruction in the loop body block. Otherwise, it will
4423// be the first instruction in the remainder block.
4424//
4425/// \returns { LoopBody, Remainder }
4426static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4430
4431 // To insert the loop we need to split the block. Move everything after this
4432 // point to a new block, and insert a new empty block between the two.
4434 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4436 ++MBBI;
4437
4438 MF->insert(MBBI, LoopBB);
4439 MF->insert(MBBI, RemainderBB);
4440
4441 LoopBB->addSuccessor(LoopBB);
4442 LoopBB->addSuccessor(RemainderBB);
4443
4444 // Move the rest of the block into a new block.
4445 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4446
4447 if (InstInLoop) {
4448 auto Next = std::next(I);
4449
4450 // Move instruction to loop body.
4451 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4452
4453 // Move the rest of the block.
4454 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4455 } else {
4456 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4457 }
4458
4459 MBB.addSuccessor(LoopBB);
4460
4461 return std::pair(LoopBB, RemainderBB);
4462}
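// Resulting control flow (sketch of the successor wiring above):
//
//          MBB
//           |
//           v
//        LoopBB <--+
//         |  |     |
//         |  +-----+   (self edge: the loop body)
//         v
//     RemainderBB
//
// MBB falls through to LoopBB, LoopBB loops on itself until done and then
// exits to RemainderBB, which inherits MBB's original successors.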
4463
4464/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4466 MachineBasicBlock *MBB = MI.getParent();
4468 auto I = MI.getIterator();
4469 auto E = std::next(I);
4470
4471 // clang-format off
4472 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4473 .addImm(0);
4474 // clang-format on
4475
4476 MIBundleBuilder Bundler(*MBB, I, E);
4477 finalizeBundle(*MBB, Bundler.begin());
4478}
4479
4482 MachineBasicBlock *BB) const {
4483 const DebugLoc &DL = MI.getDebugLoc();
4484
4486
4488
4489 // Apparently kill flags are only valid if the def is in the same block?
4490 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4491 Src->setIsKill(false);
4492
4493 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4494
4495 MachineBasicBlock::iterator I = LoopBB->end();
4496
4497 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4499
4500 // Clear TRAP_STS.MEM_VIOL
4501 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4502 .addImm(0)
4503 .addImm(EncodedReg);
4504
4506
4507 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4508
4509 // Load and check TRAP_STS.MEM_VIOL
4510 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4511 .addImm(EncodedReg);
4512
4513 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4514 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4515 .addReg(Reg, RegState::Kill)
4516 .addImm(0);
4517 // clang-format off
4518 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4519 .addMBB(LoopBB);
4520 // clang-format on
4521
4522 return RemainderBB;
4523}
4524
4525// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4526// wavefront. If the value is uniform and just happens to be in a VGPR, this
4527// will only do one iteration. In the worst case, this will loop 64 times.
4528//
4529// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
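// The generated waterfall loop is, in pseudocode (explanatory sketch only):
//
//   loop:
//     CurrentIdx = v_readfirstlane_b32(Idx)       ; pick one lane's index
//     Cond       = v_cmp_eq_u32(CurrentIdx, Idx)  ; lanes sharing that index
//     exec       = s_and_saveexec(Cond)           ; run just those lanes
//     M0 / SGPRIdx = CurrentIdx (+ Offset)
//     ... indexed operation inserted at the returned point ...
//     exec       = s_xor_term(exec, saved exec)   ; retire handled lanes
//     s_cbranch_execnz loop
//
// so the body executes once per distinct index value present in the wave.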
4532 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4533 const DebugLoc &DL, const MachineOperand &Idx,
4534 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4535 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4536 Register &SGPRIdxReg) {
4537
4538 MachineFunction *MF = OrigBB.getParent();
4539 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4540 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4542
4543 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4544 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4545 Register NewExec = MRI.createVirtualRegister(BoolRC);
4546 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4547 Register CondReg = MRI.createVirtualRegister(BoolRC);
4548
4549 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4550 .addReg(InitReg)
4551 .addMBB(&OrigBB)
4552 .addReg(ResultReg)
4553 .addMBB(&LoopBB);
4554
4555 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4556 .addReg(InitSaveExecReg)
4557 .addMBB(&OrigBB)
4558 .addReg(NewExec)
4559 .addMBB(&LoopBB);
4560
4561 // Read the next variant <- also loop target.
4562 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4563 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4564
4565 // Compare the just read M0 value to all possible Idx values.
4566 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4567 .addReg(CurrentIdxReg)
4568 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4569
4570 // Update EXEC, save the original EXEC value to VCC.
4571 BuildMI(LoopBB, I, DL,
4572 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4573 : AMDGPU::S_AND_SAVEEXEC_B64),
4574 NewExec)
4575 .addReg(CondReg, RegState::Kill);
4576
4577 MRI.setSimpleHint(NewExec, CondReg);
4578
4579 if (UseGPRIdxMode) {
4580 if (Offset == 0) {
4581 SGPRIdxReg = CurrentIdxReg;
4582 } else {
4583 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4584 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4585 .addReg(CurrentIdxReg, RegState::Kill)
4586 .addImm(Offset);
4587 }
4588 } else {
4589 // Move index from VCC into M0
4590 if (Offset == 0) {
4591 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4592 .addReg(CurrentIdxReg, RegState::Kill);
4593 } else {
4594 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4595 .addReg(CurrentIdxReg, RegState::Kill)
4596 .addImm(Offset);
4597 }
4598 }
4599
4600 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4601 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4602 MachineInstr *InsertPt =
4603 BuildMI(LoopBB, I, DL,
4604 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4605 : AMDGPU::S_XOR_B64_term),
4606 Exec)
4607 .addReg(Exec)
4608 .addReg(NewExec);
4609
4610 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4611 // s_cbranch_scc0?
4612
4613 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4614 // clang-format off
4615 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4616 .addMBB(&LoopBB);
4617 // clang-format on
4618
4619 return InsertPt->getIterator();
4620}
4621
4622 // This has slightly sub-optimal regalloc when the source vector is killed by
4623 // the read. The register allocator does not understand that the kill is
4624 // per-workitem, so the source is kept alive for the whole loop and we end up
4625 // not re-using a subregister from it, using 1 more VGPR than necessary. That
4626 // extra VGPR was saved when this was expanded after register allocation.
4629 unsigned InitResultReg, unsigned PhiReg, int Offset,
4630 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4632 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4633 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4635 const DebugLoc &DL = MI.getDebugLoc();
4637
4638 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4639 Register DstReg = MI.getOperand(0).getReg();
4640 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4641 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4642 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4643 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4644
4645 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4646
4647 // Save the EXEC mask
4648 // clang-format off
4649 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4650 .addReg(Exec);
4651 // clang-format on
4652
4653 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4654
4655 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4656
4657 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4658 InitResultReg, DstReg, PhiReg, TmpExec,
4659 Offset, UseGPRIdxMode, SGPRIdxReg);
4660
4661 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4663 ++MBBI;
4664 MF->insert(MBBI, LandingPad);
4665 LoopBB->removeSuccessor(RemainderBB);
4666 LandingPad->addSuccessor(RemainderBB);
4667 LoopBB->addSuccessor(LandingPad);
4668 MachineBasicBlock::iterator First = LandingPad->begin();
4669 // clang-format off
4670 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4671 .addReg(SaveExec);
4672 // clang-format on
4673
4674 return InsPt;
4675}
4676
4677// Returns subreg index, offset
4678static std::pair<unsigned, int>
4680 const TargetRegisterClass *SuperRC, unsigned VecReg,
4681 int Offset) {
4682 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4683
4684 // Skip out of bounds offsets, or else we would end up using an undefined
4685 // register.
4686 if (Offset >= NumElts || Offset < 0)
4687 return std::pair(AMDGPU::sub0, Offset);
4688
4689 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4690}
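// Worked example (illustrative only): for a 128-bit register tuple, NumElts
// is 4, so a constant offset of 2 folds into the subregister index and
// returns (sub2, 0), while an out-of-range offset such as 7 is left as-is
// and returned as (sub0, 7) to be added to the dynamic index instead.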
4691
4694 int Offset) {
4695 MachineBasicBlock *MBB = MI.getParent();
4696 const DebugLoc &DL = MI.getDebugLoc();
4698
4699 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4700
4701 assert(Idx->getReg() != AMDGPU::NoRegister);
4702
4703 if (Offset == 0) {
4704 // clang-format off
4705 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4706 .add(*Idx);
4707 // clang-format on
4708 } else {
4709 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4710 .add(*Idx)
4711 .addImm(Offset);
4712 }
4713}
4714
4717 int Offset) {
4718 MachineBasicBlock *MBB = MI.getParent();
4719 const DebugLoc &DL = MI.getDebugLoc();
4721
4722 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4723
4724 if (Offset == 0)
4725 return Idx->getReg();
4726
4727 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4728 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4729 .add(*Idx)
4730 .addImm(Offset);
4731 return Tmp;
4732}
4733
4736 const GCNSubtarget &ST) {
4737 const SIInstrInfo *TII = ST.getInstrInfo();
4738 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4741
4742 Register Dst = MI.getOperand(0).getReg();
4743 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4744 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4745 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4746
4747 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4748 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4749
4750 unsigned SubReg;
4751 std::tie(SubReg, Offset) =
4752 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4753
4754 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4755
4756 // Check for a SGPR index.
4757 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4759 const DebugLoc &DL = MI.getDebugLoc();
4760
4761 if (UseGPRIdxMode) {
4762 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4763 // to avoid interfering with other uses, so probably requires a new
4764 // optimization pass.
4766
4767 const MCInstrDesc &GPRIDXDesc =
4768 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4769 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4770 .addReg(SrcReg)
4771 .addReg(Idx)
4772 .addImm(SubReg);
4773 } else {
4775
4776 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4777 .addReg(SrcReg, 0, SubReg)
4778 .addReg(SrcReg, RegState::Implicit);
4779 }
4780
4781 MI.eraseFromParent();
4782
4783 return &MBB;
4784 }
4785
4786 // Control flow needs to be inserted if indexing with a VGPR.
4787 const DebugLoc &DL = MI.getDebugLoc();
4789
4790 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4791 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4792
4793 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4794
4795 Register SGPRIdxReg;
4796 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4797 UseGPRIdxMode, SGPRIdxReg);
4798
4799 MachineBasicBlock *LoopBB = InsPt->getParent();
4800
4801 if (UseGPRIdxMode) {
4802 const MCInstrDesc &GPRIDXDesc =
4803 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4804
4805 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4806 .addReg(SrcReg)
4807 .addReg(SGPRIdxReg)
4808 .addImm(SubReg);
4809 } else {
4810 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4811 .addReg(SrcReg, 0, SubReg)
4812 .addReg(SrcReg, RegState::Implicit);
4813 }
4814
4815 MI.eraseFromParent();
4816
4817 return LoopBB;
4818}
4819
4822 const GCNSubtarget &ST) {
4823 const SIInstrInfo *TII = ST.getInstrInfo();
4824 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4827
4828 Register Dst = MI.getOperand(0).getReg();
4829 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4830 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4831 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4832 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4833 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4834 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4835
4836 // This can be an immediate, but will be folded later.
4837 assert(Val->getReg());
4838
4839 unsigned SubReg;
4840 std::tie(SubReg, Offset) =
4841 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4842 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4843
4844 if (Idx->getReg() == AMDGPU::NoRegister) {
4846 const DebugLoc &DL = MI.getDebugLoc();
4847
4848 assert(Offset == 0);
4849
4850 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4851 .add(*SrcVec)
4852 .add(*Val)
4853 .addImm(SubReg);
4854
4855 MI.eraseFromParent();
4856 return &MBB;
4857 }
4858
4859 // Check for a SGPR index.
4860 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4862 const DebugLoc &DL = MI.getDebugLoc();
4863
4864 if (UseGPRIdxMode) {
4866
4867 const MCInstrDesc &GPRIDXDesc =
4868 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4869 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4870 .addReg(SrcVec->getReg())
4871 .add(*Val)
4872 .addReg(Idx)
4873 .addImm(SubReg);
4874 } else {
4876
4877 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4878 TRI.getRegSizeInBits(*VecRC), 32, false);
4879 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4880 .addReg(SrcVec->getReg())
4881 .add(*Val)
4882 .addImm(SubReg);
4883 }
4884 MI.eraseFromParent();
4885 return &MBB;
4886 }
4887
4888 // Control flow needs to be inserted if indexing with a VGPR.
4889 if (Val->isReg())
4890 MRI.clearKillFlags(Val->getReg());
4891
4892 const DebugLoc &DL = MI.getDebugLoc();
4893
4894 Register PhiReg = MRI.createVirtualRegister(VecRC);
4895
4896 Register SGPRIdxReg;
4897 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4898 UseGPRIdxMode, SGPRIdxReg);
4899 MachineBasicBlock *LoopBB = InsPt->getParent();
4900
4901 if (UseGPRIdxMode) {
4902 const MCInstrDesc &GPRIDXDesc =
4903 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4904
4905 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4906 .addReg(PhiReg)
4907 .add(*Val)
4908 .addReg(SGPRIdxReg)
4909 .addImm(SubReg);
4910 } else {
4911 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4912 TRI.getRegSizeInBits(*VecRC), 32, false);
4913 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4914 .addReg(PhiReg)
4915 .add(*Val)
4916 .addImm(SubReg);
4917 }
4918
4919 MI.eraseFromParent();
4920 return LoopBB;
4921}
4922
4925 const GCNSubtarget &ST,
4926 unsigned Opc) {
4928 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4929 const DebugLoc &DL = MI.getDebugLoc();
4930 const SIInstrInfo *TII = ST.getInstrInfo();
4931
4932 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4933 Register SrcReg = MI.getOperand(1).getReg();
4934 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4935 Register DstReg = MI.getOperand(0).getReg();
4936 MachineBasicBlock *RetBB = nullptr;
4937 if (isSGPR) {
4938 // These operations are idempotent for a uniform value (i.e. an SGPR); the
4939 // reduced value will be the same as the given SGPR.
4940 // clang-format off
4941 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4942 .addReg(SrcReg);
4943 // clang-format on
4944 RetBB = &BB;
4945 } else {
4946 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4947 // operand. For now, for all the cases (default, Iterative and DPP) we use
4948 // the iterative approach by default.
4949
4950 // To reduce the VGPR using the iterative approach, we need to iterate
4951 // over all the active lanes. The lowering consists of a ComputeLoop,
4952 // which iterates over only the active lanes. We use a copy of the EXEC
4953 // register as the induction variable; every active lane clears its bit with
4954 // bitset0 so that we get the next active lane in the next iteration.
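// In pseudocode (explanatory sketch only), the generated loop is:
//
//   ActiveBits  = EXEC
//   Accumulator = identity (0, or UINT_MAX for umin)
//   do {
//     Lane        = s_ff1(ActiveBits)             ; lowest remaining lane
//     LaneValue   = v_readlane_b32(Src, Lane)
//     Accumulator = op(Accumulator, LaneValue)
//     ActiveBits  = s_bitset0(ActiveBits, Lane)
//   } while (ActiveBits != 0)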
4956 Register SrcReg = MI.getOperand(1).getReg();
4957
4958 // Create Control flow for loop
4959 // Split MI's Machine Basic block into For loop
4960 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4961
4962 // Create virtual registers required for lowering.
4963 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4964 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4965 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4966 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4967
4968 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4969 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4970 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4971
4972 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4973 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4974
4975 bool IsWave32 = ST.isWave32();
4976 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4977 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4978
4979 // Create initial values of the induction variable from EXEC and the
4980 // accumulator, and insert a branch to the newly created ComputeLoop block.
4981 uint32_t InitalValue =
4982 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4983 auto TmpSReg =
4984 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4985 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4986 .addImm(InitalValue);
4987 // clang-format off
4988 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
4989 .addMBB(ComputeLoop);
4990 // clang-format on
4991
4992 // Start constructing ComputeLoop
4993 I = ComputeLoop->end();
4994 auto Accumulator =
4995 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4996 .addReg(InitalValReg)
4997 .addMBB(&BB);
4998 auto ActiveBits =
4999 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5000 .addReg(TmpSReg->getOperand(0).getReg())
5001 .addMBB(&BB);
5002
5003 // Perform the computations
5004 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5005 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5006 .addReg(ActiveBits->getOperand(0).getReg());
5007 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5008 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5009 .addReg(SrcReg)
5010 .addReg(FF1->getOperand(0).getReg());
5011 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5012 .addReg(Accumulator->getOperand(0).getReg())
5013 .addReg(LaneValue->getOperand(0).getReg());
5014
5015 // Manipulate the iterator to get the next active lane
5016 unsigned BITSETOpc =
5017 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5018 auto NewActiveBits =
5019 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5020 .addReg(FF1->getOperand(0).getReg())
5021 .addReg(ActiveBits->getOperand(0).getReg());
5022
5023 // Add phi nodes
5024 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5025 .addMBB(ComputeLoop);
5026 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5027 .addMBB(ComputeLoop);
5028
5029 // Creating branching
5030 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5031 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5032 .addReg(NewActiveBits->getOperand(0).getReg())
5033 .addImm(0);
5034 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5035 .addMBB(ComputeLoop);
5036
5037 RetBB = ComputeEnd;
5038 }
5039 MI.eraseFromParent();
5040 return RetBB;
5041}
5042
5045 MachineBasicBlock *BB) const {
5046
5048 MachineFunction *MF = BB->getParent();
5050
5051 switch (MI.getOpcode()) {
5052 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5053 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5054 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5055 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5056 case AMDGPU::S_UADDO_PSEUDO:
5057 case AMDGPU::S_USUBO_PSEUDO: {
5058 const DebugLoc &DL = MI.getDebugLoc();
5059 MachineOperand &Dest0 = MI.getOperand(0);
5060 MachineOperand &Dest1 = MI.getOperand(1);
5061 MachineOperand &Src0 = MI.getOperand(2);
5062 MachineOperand &Src1 = MI.getOperand(3);
5063
5064 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5065 ? AMDGPU::S_ADD_I32
5066 : AMDGPU::S_SUB_I32;
5067 // clang-format off
5068 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5069 .add(Src0)
5070 .add(Src1);
5071 // clang-format on
5072
5073 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5074 .addImm(1)
5075 .addImm(0);
5076
5077 MI.eraseFromParent();
5078 return BB;
5079 }
5080 case AMDGPU::S_ADD_U64_PSEUDO:
5081 case AMDGPU::S_SUB_U64_PSEUDO: {
5082 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5083 // For GFX12, we emit s_add_u64 and s_sub_u64.
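// Sketch of the pre-GFX12 expansion (explanatory only): the low halves are
// combined with s_add_u32/s_sub_u32, which writes the carry/borrow into SCC,
// and the high halves with s_addc_u32/s_subb_u32, which consumes SCC, before
// the two 32-bit results are recombined with REG_SEQUENCE.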
5084 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5086 const DebugLoc &DL = MI.getDebugLoc();
5087 MachineOperand &Dest = MI.getOperand(0);
5088 MachineOperand &Src0 = MI.getOperand(1);
5089 MachineOperand &Src1 = MI.getOperand(2);
5090 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5091 if (Subtarget->hasScalarAddSub64()) {
5092 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5093 // clang-format off
5094 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5095 .add(Src0)
5096 .add(Src1);
5097 // clang-format on
5098 } else {
5099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5100 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5101
5102 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5103 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5104
5105 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5106 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5107 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5108 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5109
5110 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5111 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5112 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5113 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5114
5115 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5116 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5117 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5118 .add(Src0Sub0)
5119 .add(Src1Sub0);
5120 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5121 .add(Src0Sub1)
5122 .add(Src1Sub1);
5123 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5124 .addReg(DestSub0)
5125 .addImm(AMDGPU::sub0)
5126 .addReg(DestSub1)
5127 .addImm(AMDGPU::sub1);
5128 }
5129 MI.eraseFromParent();
5130 return BB;
5131 }
5132 case AMDGPU::V_ADD_U64_PSEUDO:
5133 case AMDGPU::V_SUB_U64_PSEUDO: {
5135 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5136 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5137 const DebugLoc &DL = MI.getDebugLoc();
5138
5139 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5140
5141 MachineOperand &Dest = MI.getOperand(0);
5142 MachineOperand &Src0 = MI.getOperand(1);
5143 MachineOperand &Src1 = MI.getOperand(2);
5144
5145 if (IsAdd && ST.hasLshlAddB64()) {
5146 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5147 Dest.getReg())
5148 .add(Src0)
5149 .addImm(0)
5150 .add(Src1);
5151 TII->legalizeOperands(*Add);
5152 MI.eraseFromParent();
5153 return BB;
5154 }
5155
5156 const auto *CarryRC = TRI->getWaveMaskRegClass();
5157
5158 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5159 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5160
5161 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5162 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5163
5164 const TargetRegisterClass *Src0RC = Src0.isReg()
5165 ? MRI.getRegClass(Src0.getReg())
5166 : &AMDGPU::VReg_64RegClass;
5167 const TargetRegisterClass *Src1RC = Src1.isReg()
5168 ? MRI.getRegClass(Src1.getReg())
5169 : &AMDGPU::VReg_64RegClass;
5170
5171 const TargetRegisterClass *Src0SubRC =
5172 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5173 const TargetRegisterClass *Src1SubRC =
5174 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5175
5176 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5177 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5178 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5179 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5180
5181 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5182 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5183 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5184 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5185
5186 unsigned LoOpc =
5187 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5188 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5189 .addReg(CarryReg, RegState::Define)
5190 .add(SrcReg0Sub0)
5191 .add(SrcReg1Sub0)
5192 .addImm(0); // clamp bit
5193
5194 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5195 MachineInstr *HiHalf =
5196 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5197 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5198 .add(SrcReg0Sub1)
5199 .add(SrcReg1Sub1)
5200 .addReg(CarryReg, RegState::Kill)
5201 .addImm(0); // clamp bit
5202
5203 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5204 .addReg(DestSub0)
5205 .addImm(AMDGPU::sub0)
5206 .addReg(DestSub1)
5207 .addImm(AMDGPU::sub1);
5208 TII->legalizeOperands(*LoHalf);
5209 TII->legalizeOperands(*HiHalf);
5210 MI.eraseFromParent();
5211 return BB;
5212 }
5213 case AMDGPU::S_ADD_CO_PSEUDO:
5214 case AMDGPU::S_SUB_CO_PSEUDO: {
5215 // This pseudo can only be selected
5216 // from a uniform add/subcarry node. All the VGPR operands are
5217 // therefore assumed to be splat vectors.
5219 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5220 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5222 const DebugLoc &DL = MI.getDebugLoc();
5223 MachineOperand &Dest = MI.getOperand(0);
5224 MachineOperand &CarryDest = MI.getOperand(1);
5225 MachineOperand &Src0 = MI.getOperand(2);
5226 MachineOperand &Src1 = MI.getOperand(3);
5227 MachineOperand &Src2 = MI.getOperand(4);
5228 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5229 ? AMDGPU::S_ADDC_U32
5230 : AMDGPU::S_SUBB_U32;
5231 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5232 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5233 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5234 .addReg(Src0.getReg());
5235 Src0.setReg(RegOp0);
5236 }
5237 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5238 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5239 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5240 .addReg(Src1.getReg());
5241 Src1.setReg(RegOp1);
5242 }
5243 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5244 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5245 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5246 .addReg(Src2.getReg());
5247 Src2.setReg(RegOp2);
5248 }
5249
5250 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5251 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5252 assert(WaveSize == 64 || WaveSize == 32);
5253
5254 if (WaveSize == 64) {
5255 if (ST.hasScalarCompareEq64()) {
5256 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5257 .addReg(Src2.getReg())
5258 .addImm(0);
5259 } else {
5260 const TargetRegisterClass *SubRC =
5261 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5262 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5263 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5264 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5265 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5266 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5267
5268 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5269 .add(Src2Sub0)
5270 .add(Src2Sub1);
5271
5272 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5273 .addReg(Src2_32, RegState::Kill)
5274 .addImm(0);
5275 }
5276 } else {
5277 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5278 .addReg(Src2.getReg())
5279 .addImm(0);
5280 }
5281
5282 // clang-format off
5283 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5284 .add(Src0)
5285 .add(Src1);
5286 // clang-format on
5287
5288 unsigned SelOpc =
5289 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5290
5291 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5292 .addImm(-1)
5293 .addImm(0);
5294
5295 MI.eraseFromParent();
5296 return BB;
5297 }
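// A rough scalar model of the sequence above, assuming the uniform case this
// pseudo is restricted to: any VGPR inputs are made scalar via
// V_READFIRSTLANE_B32, the incoming carry is compared against zero to set SCC,
// S_ADDC_U32/S_SUBB_U32 performs the operation, and S_CSELECT materializes the
// outgoing carry as -1/0. The add case is shown; the helper name and the bool
// carry type are illustrative only:
//
//   #include <cstdint>
//
//   uint32_t addCarryUniform(uint32_t Src0, uint32_t Src1, uint64_t CarryIn,
//                            bool &CarryOut) {
//     bool SCC = (CarryIn != 0);                             // S_CMP_LG
//     uint64_t Sum = uint64_t(Src0) + Src1 + (SCC ? 1 : 0);  // S_ADDC_U32
//     CarryOut = (Sum >> 32) != 0;                           // S_CSELECT -1/0
//     return uint32_t(Sum);
//   }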
5298 case AMDGPU::SI_INIT_M0: {
5299 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5300 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5301 .add(MI.getOperand(0));
5302 MI.eraseFromParent();
5303 return BB;
5304 }
5305 case AMDGPU::GET_GROUPSTATICSIZE: {
5306 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5307 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5308 DebugLoc DL = MI.getDebugLoc();
5309 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5310 .add(MI.getOperand(0))
5311 .addImm(MFI->getLDSSize());
5312 MI.eraseFromParent();
5313 return BB;
5314 }
5315 case AMDGPU::GET_SHADERCYCLESHILO: {
5318 const DebugLoc &DL = MI.getDebugLoc();
5319 // The algorithm is:
5320 //
5321 // hi1 = getreg(SHADER_CYCLES_HI)
5322 // lo1 = getreg(SHADER_CYCLES_LO)
5323 // hi2 = getreg(SHADER_CYCLES_HI)
5324 //
5325 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5326 // Otherwise there was overflow and the result is hi2:0. In both cases the
5327 // result should represent the actual time at some point during the sequence
5328 // of three getregs.
5329 using namespace AMDGPU::Hwreg;
5330 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5331 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5332 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5333 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5334 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5335 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5336 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5337 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5338 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5339 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5340 .addReg(RegHi1)
5341 .addReg(RegHi2);
5342 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5343 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5344 .addReg(RegLo1)
5345 .addImm(0);
5346 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5347 .add(MI.getOperand(0))
5348 .addReg(RegLo)
5349 .addImm(AMDGPU::sub0)
5350 .addReg(RegHi2)
5351 .addImm(AMDGPU::sub1);
5352 MI.eraseFromParent();
5353 return BB;
5354 }
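// The hi/lo/hi sequence above is the usual race-free way to sample a 64-bit
// counter through two 32-bit reads. A minimal standalone sketch, with the
// callable parameters standing in for S_GETREG_B32 (hypothetical helpers, not
// a real API):
//
//   #include <cstdint>
//
//   template <typename GetLo, typename GetHi>   // model S_GETREG_B32 reads
//   uint64_t readShaderCycles64(GetLo ReadLo, GetHi ReadHi) {
//     uint32_t Hi1 = ReadHi();
//     uint32_t Lo1 = ReadLo();
//     uint32_t Hi2 = ReadHi();
//     uint32_t Lo = (Hi1 == Hi2) ? Lo1 : 0;   // S_CMP_EQ + S_CSELECT
//     return (uint64_t(Hi2) << 32) | Lo;      // REG_SEQUENCE of sub0/sub1
//   }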
5355 case AMDGPU::SI_INDIRECT_SRC_V1:
5356 case AMDGPU::SI_INDIRECT_SRC_V2:
5357 case AMDGPU::SI_INDIRECT_SRC_V4:
5358 case AMDGPU::SI_INDIRECT_SRC_V8:
5359 case AMDGPU::SI_INDIRECT_SRC_V9:
5360 case AMDGPU::SI_INDIRECT_SRC_V10:
5361 case AMDGPU::SI_INDIRECT_SRC_V11:
5362 case AMDGPU::SI_INDIRECT_SRC_V12:
5363 case AMDGPU::SI_INDIRECT_SRC_V16:
5364 case AMDGPU::SI_INDIRECT_SRC_V32:
5365 return emitIndirectSrc(MI, *BB, *getSubtarget());
5366 case AMDGPU::SI_INDIRECT_DST_V1:
5367 case AMDGPU::SI_INDIRECT_DST_V2:
5368 case AMDGPU::SI_INDIRECT_DST_V4:
5369 case AMDGPU::SI_INDIRECT_DST_V8:
5370 case AMDGPU::SI_INDIRECT_DST_V9:
5371 case AMDGPU::SI_INDIRECT_DST_V10:
5372 case AMDGPU::SI_INDIRECT_DST_V11:
5373 case AMDGPU::SI_INDIRECT_DST_V12:
5374 case AMDGPU::SI_INDIRECT_DST_V16:
5375 case AMDGPU::SI_INDIRECT_DST_V32:
5376 return emitIndirectDst(MI, *BB, *getSubtarget());
5377 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5378 case AMDGPU::SI_KILL_I1_PSEUDO:
5379 return splitKillBlock(MI, BB);
5380 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5382 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5383 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5384
5385 Register Dst = MI.getOperand(0).getReg();
5386 const MachineOperand &Src0 = MI.getOperand(1);
5387 const MachineOperand &Src1 = MI.getOperand(2);
5388 const DebugLoc &DL = MI.getDebugLoc();
5389 Register SrcCond = MI.getOperand(3).getReg();
5390
5391 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5392 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5393 const auto *CondRC = TRI->getWaveMaskRegClass();
5394 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5395
5396 const TargetRegisterClass *Src0RC = Src0.isReg()
5397 ? MRI.getRegClass(Src0.getReg())
5398 : &AMDGPU::VReg_64RegClass;
5399 const TargetRegisterClass *Src1RC = Src1.isReg()
5400 ? MRI.getRegClass(Src1.getReg())
5401 : &AMDGPU::VReg_64RegClass;
5402
5403 const TargetRegisterClass *Src0SubRC =
5404 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5405 const TargetRegisterClass *Src1SubRC =
5406 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5407
5408 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5409 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5410 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5411 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5412
5413 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5414 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5415 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5416 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5417
5418 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5419 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5420 .addImm(0)
5421 .add(Src0Sub0)
5422 .addImm(0)
5423 .add(Src1Sub0)
5424 .addReg(SrcCondCopy);
5425 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5426 .addImm(0)
5427 .add(Src0Sub1)
5428 .addImm(0)
5429 .add(Src1Sub1)
5430 .addReg(SrcCondCopy);
5431
5432 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5433 .addReg(DstLo)
5434 .addImm(AMDGPU::sub0)
5435 .addReg(DstHi)
5436 .addImm(AMDGPU::sub1);
5437 MI.eraseFromParent();
5438 return BB;
5439 }
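// The 64-bit select above is expanded as two 32-bit V_CNDMASK_B32 operations,
// one per sub-register half, both keyed off the same copied condition mask,
// with the halves rejoined by REG_SEQUENCE. A per-lane model in plain C++
// (the helper name is illustrative; CondBit stands for this lane's bit of the
// wave mask):
//
//   #include <cstdint>
//
//   uint64_t cndmask64(bool CondBit, uint64_t SrcFalse, uint64_t SrcTrue) {
//     uint32_t Lo = CondBit ? uint32_t(SrcTrue) : uint32_t(SrcFalse);
//     uint32_t Hi = CondBit ? uint32_t(SrcTrue >> 32) : uint32_t(SrcFalse >> 32);
//     return (uint64_t(Hi) << 32) | Lo;
//   }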
5440 case AMDGPU::SI_BR_UNDEF: {
5442 const DebugLoc &DL = MI.getDebugLoc();
5443 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5444 .add(MI.getOperand(0));
5445 Br->getOperand(1).setIsUndef(); // read undef SCC
5446 MI.eraseFromParent();
5447 return BB;
5448 }
5449 case AMDGPU::ADJCALLSTACKUP:
5450 case AMDGPU::ADJCALLSTACKDOWN: {
5452 MachineInstrBuilder MIB(*MF, &MI);
5453 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5454 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5455 return BB;
5456 }
5457 case AMDGPU::SI_CALL_ISEL: {
5459 const DebugLoc &DL = MI.getDebugLoc();
5460
5461 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5462
5464 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5465
5466 for (const MachineOperand &MO : MI.operands())
5467 MIB.add(MO);
5468
5469 MIB.cloneMemRefs(MI);
5470 MI.eraseFromParent();
5471 return BB;
5472 }
5473 case AMDGPU::V_ADD_CO_U32_e32:
5474 case AMDGPU::V_SUB_CO_U32_e32:
5475 case AMDGPU::V_SUBREV_CO_U32_e32: {
5476 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5477 const DebugLoc &DL = MI.getDebugLoc();
5478 unsigned Opc = MI.getOpcode();
5479
5480 bool NeedClampOperand = false;
5481 if (TII->pseudoToMCOpcode(Opc) == -1) {
5482 Opc = AMDGPU::getVOPe64(Opc);
5483 NeedClampOperand = true;
5484 }
5485
5486 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5487 if (TII->isVOP3(*I)) {
5488 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5489 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5490 I.addReg(TRI->getVCC(), RegState::Define);
5491 }
5492 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5493 if (NeedClampOperand)
5494 I.addImm(0); // clamp bit for e64 encoding
5495
5496 TII->legalizeOperands(*I);
5497
5498 MI.eraseFromParent();
5499 return BB;
5500 }
5501 case AMDGPU::V_ADDC_U32_e32:
5502 case AMDGPU::V_SUBB_U32_e32:
5503 case AMDGPU::V_SUBBREV_U32_e32:
5504 // These instructions have an implicit use of vcc which counts towards the
5505 // constant bus limit.
5506 TII->legalizeOperands(MI);
5507 return BB;
5508 case AMDGPU::DS_GWS_INIT:
5509 case AMDGPU::DS_GWS_SEMA_BR:
5510 case AMDGPU::DS_GWS_BARRIER:
5511 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5512 [[fallthrough]];
5513 case AMDGPU::DS_GWS_SEMA_V:
5514 case AMDGPU::DS_GWS_SEMA_P:
5515 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5516 // An s_waitcnt 0 is required to be the instruction immediately following.
5517 if (getSubtarget()->hasGWSAutoReplay()) {
5519 return BB;
5520 }
5521
5522 return emitGWSMemViolTestLoop(MI, BB);
5523 case AMDGPU::S_SETREG_B32: {
5524 // Try to optimize cases that only set the denormal mode or rounding mode.
5525 //
5526 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5527 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5528 // instead.
5529 //
5530 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5531 // allow you to have a side-effect-free instruction in the output of a
5532 // side-effecting pattern.
5533 auto [ID, Offset, Width] =
5534 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5535 if (ID != AMDGPU::Hwreg::ID_MODE)
5536 return BB;
5537
5538 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5539 const unsigned SetMask = WidthMask << Offset;
5540
5541 if (getSubtarget()->hasDenormModeInst()) {
5542 unsigned SetDenormOp = 0;
5543 unsigned SetRoundOp = 0;
5544
5545 // The dedicated instructions can only set the whole denorm or round mode
5546 // at once, not a subset of bits in either.
5547 if (SetMask ==
5548 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5549 // If this fully sets both the round and denorm mode, emit the two
5550 // dedicated instructions for these.
5551 SetRoundOp = AMDGPU::S_ROUND_MODE;
5552 SetDenormOp = AMDGPU::S_DENORM_MODE;
5553 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5554 SetRoundOp = AMDGPU::S_ROUND_MODE;
5555 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5556 SetDenormOp = AMDGPU::S_DENORM_MODE;
5557 }
5558
5559 if (SetRoundOp || SetDenormOp) {
5561 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5562 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5563 unsigned ImmVal = Def->getOperand(1).getImm();
5564 if (SetRoundOp) {
5565 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5566 .addImm(ImmVal & 0xf);
5567
5568 // If we also have the denorm mode, get just the denorm mode bits.
5569 ImmVal >>= 4;
5570 }
5571
5572 if (SetDenormOp) {
5573 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5574 .addImm(ImmVal & 0xf);
5575 }
5576
5577 MI.eraseFromParent();
5578 return BB;
5579 }
5580 }
5581 }
5582
5583 // If only FP bits are touched, use the no-side-effects pseudo.
5584 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5585 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5586 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5587
5588 return BB;
5589 }
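// A sketch of the field arithmetic the case above relies on: the hwreg
// immediate decodes to (ID, Offset, Width), the written bits of the MODE
// register are WidthMask << Offset, and for a constant source the low four
// bits feed S_ROUND_MODE while the next four feed S_DENORM_MODE. Helper names
// are illustrative only; the sketch assumes Width < 32:
//
//   unsigned setMaskFor(unsigned Offset, unsigned Width) {
//     unsigned WidthMask = (1u << Width) - 1;   // maskTrailingOnes(Width)
//     return WidthMask << Offset;               // bits of MODE being written
//   }
//   unsigned roundModeBits(unsigned ImmVal)  { return ImmVal & 0xf; }
//   unsigned denormModeBits(unsigned ImmVal) { return (ImmVal >> 4) & 0xf; }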
5590 case AMDGPU::S_INVERSE_BALLOT_U32:
5591 case AMDGPU::S_INVERSE_BALLOT_U64:
5592 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5593 // necessary. After that they are equivalent to a COPY.
5594 MI.setDesc(TII->get(AMDGPU::COPY));
5595 return BB;
5596 case AMDGPU::ENDPGM_TRAP: {
5597 const DebugLoc &DL = MI.getDebugLoc();
5598 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5599 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5600 MI.addOperand(MachineOperand::CreateImm(0));
5601 return BB;
5602 }
5603
5604 // We need a block split to make the real endpgm a terminator. We also don't
5605 // want to break phis in successor blocks, so we can't just delete to the
5606 // end of the block.
5607
5608 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5610 MF->push_back(TrapBB);
5611 // clang-format off
5612 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5613 .addImm(0);
5614 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5615 .addMBB(TrapBB);
5616 // clang-format on
5617
5618 BB->addSuccessor(TrapBB);
5619 MI.eraseFromParent();
5620 return SplitBB;
5621 }
5622 case AMDGPU::SIMULATED_TRAP: {
5623 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5625 MachineBasicBlock *SplitBB =
5626 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5627 MI.eraseFromParent();
5628 return SplitBB;
5629 }
5630 default:
5631 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5632 if (!MI.mayStore())
5634 return BB;
5635 }
5637 }
5638}
5639
5641 // This currently forces unfolding various combinations of fsub into fma with
5642 // free fneg'd operands. As long as we have fast FMA (controlled by
5643 // isFMAFasterThanFMulAndFAdd), we should perform these.
5644
5645 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5646 // most of these combines appear to be cycle neutral but save on instruction
5647 // count / code size.
5648 return true;
5649}
5650
5652
5654 EVT VT) const {
5655 if (!VT.isVector()) {
5656 return MVT::i1;
5657 }
5658 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5659}
5660
5662 // TODO: Should i16 be used always if legal? For now it would force VALU
5663 // shifts.
5664 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5665}
5666
5668 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5669 ? Ty.changeElementSize(16)
5670 : Ty.changeElementSize(32);
5671}
5672
5673 // Answering this is somewhat tricky and depends on the specific device, which
5674 // may have different rates for fma or for f64 operations in general.
5675//
5676// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5677// regardless of which device (although the number of cycles differs between
5678// devices), so it is always profitable for f64.
5679//
5680// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5681// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5682// which we can always do even without fused FP ops since it returns the same
5683// result as the separate operations and since it is always full
5684// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5685// however does not support denormals, so we do report fma as faster if we have
5686// a fast fma device and require denormals.
5687//
5689 EVT VT) const {
5690 VT = VT.getScalarType();
5691
5692 switch (VT.getSimpleVT().SimpleTy) {
5693 case MVT::f32: {
5694 // If mad is not available this depends only on if f32 fma is full rate.
5695 if (!Subtarget->hasMadMacF32Insts())
5696 return Subtarget->hasFastFMAF32();
5697
5698 // Otherwise f32 mad is always full rate and returns the same result as
5699 // the separate operations so should be preferred over fma.
5700 // However, it does not support denormals.
5701 if (!denormalModeIsFlushAllF32(MF))
5702 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5703
5704 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5705 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5706 }
5707 case MVT::f64:
5708 return true;
5709 case MVT::f16:
5710 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5711 default:
5712 break;
5713 }
5714
5715 return false;
5716}
5717
5719 LLT Ty) const {
5720 switch (Ty.getScalarSizeInBits()) {
5721 case 16:
5722 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5723 case 32:
5724 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5725 case 64:
5726 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5727 default:
5728 break;
5729 }
5730
5731 return false;
5732}
5733
5735 if (!Ty.isScalar())
5736 return false;
5737
5738 if (Ty.getScalarSizeInBits() == 16)
5739 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5740 if (Ty.getScalarSizeInBits() == 32)
5741 return Subtarget->hasMadMacF32Insts() &&
5742 denormalModeIsFlushAllF32(*MI.getMF());
5743
5744 return false;
5745}
5746
5748 const SDNode *N) const {
5749 // TODO: Check future ftz flag
5750 // v_mad_f32/v_mac_f32 do not support denormals.
5751 EVT VT = N->getValueType(0);
5752 if (VT == MVT::f32)
5753 return Subtarget->hasMadMacF32Insts() &&
5755 if (VT == MVT::f16) {
5756 return Subtarget->hasMadF16() &&
5758 }
5759
5760 return false;
5761}
5762
5763//===----------------------------------------------------------------------===//
5764// Custom DAG Lowering Operations
5765//===----------------------------------------------------------------------===//
5766
5767// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5768// wider vector type is legal.
5770 SelectionDAG &DAG) const {
5771 unsigned Opc = Op.getOpcode();
5772 EVT VT = Op.getValueType();
5773 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5774 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5775 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5776 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5777
5778 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5779
5780 SDLoc SL(Op);
5781 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5782 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5783
5784 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5785}
5786
5787// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5788// wider vector type is legal.
5790 SelectionDAG &DAG) const {
5791 unsigned Opc = Op.getOpcode();
5792 EVT VT = Op.getValueType();
5793 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5794 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5795 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5796 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5797
5798 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5799 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5800
5801 SDLoc SL(Op);
5802
5803 SDValue OpLo =
5804 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5805 SDValue OpHi =
5806 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5807
5808 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5809}
5810
5812 SelectionDAG &DAG) const {
5813 unsigned Opc = Op.getOpcode();
5814 EVT VT = Op.getValueType();
5815 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5816 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5817 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5818 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5819 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5820 VT == MVT::v32bf16);
5821
5822 SDValue Op0 = Op.getOperand(0);
5823 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5824 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5825 : std::pair(Op0, Op0);
5826
5827 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5828 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5829
5830 SDLoc SL(Op);
5831 auto ResVT = DAG.GetSplitDestVTs(VT);
5832
5833 SDValue OpLo =
5834 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5835 SDValue OpHi =
5836 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5837
5838 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5839}
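// The three split helpers above all follow the same shape: break each wide
// vector operand into a low and a high half, apply the original opcode to the
// halves, and concatenate the results. A standalone sketch over plain arrays
// (the helper name and the use of std::array are illustrative only):
//
//   #include <array>
//   #include <cstddef>
//
//   template <typename T, std::size_t N, typename Fn>
//   std::array<T, N> splitBinaryOp(const std::array<T, N> &A,
//                                  const std::array<T, N> &B, Fn Op) {
//     std::array<T, N> R{};
//     for (std::size_t I = 0; I != N / 2; ++I)   // Lo0 op Lo1
//       R[I] = Op(A[I], B[I]);
//     for (std::size_t I = N / 2; I != N; ++I)   // Hi0 op Hi1
//       R[I] = Op(A[I], B[I]);
//     return R;                                  // CONCAT_VECTORS of halves
//   }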
5840
5842 switch (Op.getOpcode()) {
5843 default:
5845 case ISD::BRCOND:
5846 return LowerBRCOND(Op, DAG);
5847 case ISD::RETURNADDR:
5848 return LowerRETURNADDR(Op, DAG);
5849 case ISD::LOAD: {
5850 SDValue Result = LowerLOAD(Op, DAG);
5851 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5852 "Load should return a value and a chain");
5853 return Result;
5854 }
5855 case ISD::FSQRT: {
5856 EVT VT = Op.getValueType();
5857 if (VT == MVT::f32)
5858 return lowerFSQRTF32(Op, DAG);
5859 if (VT == MVT::f64)
5860 return lowerFSQRTF64(Op, DAG);
5861 return SDValue();
5862 }
5863 case ISD::FSIN:
5864 case ISD::FCOS:
5865 return LowerTrig(Op, DAG);
5866 case ISD::SELECT:
5867 return LowerSELECT(Op, DAG);
5868 case ISD::FDIV:
5869 return LowerFDIV(Op, DAG);
5870 case ISD::FFREXP:
5871 return LowerFFREXP(Op, DAG);
5873 return LowerATOMIC_CMP_SWAP(Op, DAG);
5874 case ISD::STORE:
5875 return LowerSTORE(Op, DAG);
5876 case ISD::GlobalAddress: {
5879 return LowerGlobalAddress(MFI, Op, DAG);
5880 }
5882 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5884 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5886 return LowerINTRINSIC_VOID(Op, DAG);
5887 case ISD::ADDRSPACECAST:
5888 return lowerADDRSPACECAST(Op, DAG);
5890 return lowerINSERT_SUBVECTOR(Op, DAG);
5892 return lowerINSERT_VECTOR_ELT(Op, DAG);
5894 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5896 return lowerVECTOR_SHUFFLE(Op, DAG);
5898 return lowerSCALAR_TO_VECTOR(Op, DAG);
5899 case ISD::BUILD_VECTOR:
5900 return lowerBUILD_VECTOR(Op, DAG);
5901 case ISD::FP_ROUND:
5903 return lowerFP_ROUND(Op, DAG);
5904 case ISD::TRAP:
5905 return lowerTRAP(Op, DAG);
5906 case ISD::DEBUGTRAP:
5907 return lowerDEBUGTRAP(Op, DAG);
5908 case ISD::ABS:
5909 case ISD::FABS:
5910 case ISD::FNEG:
5911 case ISD::FCANONICALIZE:
5912 case ISD::BSWAP:
5913 return splitUnaryVectorOp(Op, DAG);
5914 case ISD::FMINNUM:
5915 case ISD::FMAXNUM:
5916 return lowerFMINNUM_FMAXNUM(Op, DAG);
5917 case ISD::FLDEXP:
5918 case ISD::STRICT_FLDEXP:
5919 return lowerFLDEXP(Op, DAG);
5920 case ISD::FMA:
5921 return splitTernaryVectorOp(Op, DAG);
5922 case ISD::FP_TO_SINT:
5923 case ISD::FP_TO_UINT:
5924 return LowerFP_TO_INT(Op, DAG);
5925 case ISD::SHL:
5926 case ISD::SRA:
5927 case ISD::SRL:
5928 case ISD::ADD:
5929 case ISD::SUB:
5930 case ISD::SMIN:
5931 case ISD::SMAX:
5932 case ISD::UMIN:
5933 case ISD::UMAX:
5934 case ISD::FADD:
5935 case ISD::FMUL:
5936 case ISD::FMINNUM_IEEE:
5937 case ISD::FMAXNUM_IEEE:
5938 case ISD::FMINIMUM:
5939 case ISD::FMAXIMUM:
5940 case ISD::FMINIMUMNUM:
5941 case ISD::FMAXIMUMNUM:
5942 case ISD::UADDSAT:
5943 case ISD::USUBSAT:
5944 case ISD::SADDSAT:
5945 case ISD::SSUBSAT:
5946 return splitBinaryVectorOp(Op, DAG);
5947 case ISD::MUL:
5948 return lowerMUL(Op, DAG);
5949 case ISD::SMULO:
5950 case ISD::UMULO:
5951 return lowerXMULO(Op, DAG);
5952 case ISD::SMUL_LOHI:
5953 case ISD::UMUL_LOHI:
5954 return lowerXMUL_LOHI(Op, DAG);
5956 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5957 case ISD::STACKSAVE:
5958 return LowerSTACKSAVE(Op, DAG);
5959 case ISD::GET_ROUNDING:
5960 return lowerGET_ROUNDING(Op, DAG);
5961 case ISD::SET_ROUNDING:
5962 return lowerSET_ROUNDING(Op, DAG);
5963 case ISD::PREFETCH:
5964 return lowerPREFETCH(Op, DAG);
5965 case ISD::FP_EXTEND:
5967 return lowerFP_EXTEND(Op, DAG);
5968 case ISD::GET_FPENV:
5969 return lowerGET_FPENV(Op, DAG);
5970 case ISD::SET_FPENV:
5971 return lowerSET_FPENV(Op, DAG);
5972 }
5973 return SDValue();
5974}
5975
5976// Used for D16: Casts the result of an instruction into the right vector,
5977// packs values if loads return unpacked values.
5979 const SDLoc &DL, SelectionDAG &DAG,
5980 bool Unpacked) {
5981 if (!LoadVT.isVector())
5982 return Result;
5983
5984 // Cast back to the original packed type or to a larger type that is a
5985 // multiple of 32 bits for D16. Widening the return type is required for
5986 // legalization.
5987 EVT FittingLoadVT = LoadVT;
5988 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5989 FittingLoadVT =
5990 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5991 LoadVT.getVectorNumElements() + 1);
5992 }
5993
5994 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5995 // Truncate to v2i16/v4i16.
5996 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5997
5998 // Work around the legalizer not scalarizing truncate after vector op
5999 // legalization but also not creating an intermediate vector trunc.
6001 DAG.ExtractVectorElements(Result, Elts);
6002 for (SDValue &Elt : Elts)
6003 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6004
6005 // Pad illegal v1i16/v3f16 to v4i16
6006 if ((LoadVT.getVectorNumElements() % 2) == 1)
6007 Elts.push_back(DAG.getUNDEF(MVT::i16));
6008
6009 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6010
6011 // Bitcast to original type (v2f16/v4f16).
6012 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6013 }
6014
6015 // Cast back to the original packed type.
6016 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6017}
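// For the unpacked D16 layout handled above, each 32-bit element carries a
// 16-bit value in its low half; the repack truncates every element, pads odd
// element counts, and bitcasts back to the packed type. A standalone sketch on
// raw integers (names and the use of std::vector are illustrative only):
//
//   #include <cstdint>
//   #include <vector>
//
//   std::vector<uint16_t> repackD16(const std::vector<uint32_t> &Unpacked) {
//     std::vector<uint16_t> Packed;
//     for (uint32_t Elt : Unpacked)
//       Packed.push_back(uint16_t(Elt));   // ISD::TRUNCATE to i16
//     if (Packed.size() % 2)
//       Packed.push_back(0);               // stand-in for the UNDEF pad lane
//     return Packed;                       // bitcast back to v2f16/v4f16 etc.
//   }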
6018
6019SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6020 SelectionDAG &DAG,
6022 bool IsIntrinsic) const {
6023 SDLoc DL(M);
6024
6025 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6026 EVT LoadVT = M->getValueType(0);
6027
6028 EVT EquivLoadVT = LoadVT;
6029 if (LoadVT.isVector()) {
6030 if (Unpacked) {
6031 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6032 LoadVT.getVectorNumElements());
6033 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6034 // Widen v3f16 to legal type
6035 EquivLoadVT =
6036 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6037 LoadVT.getVectorNumElements() + 1);
6038 }
6039 }
6040
6041 // Change from v4f16/v2f16 to EquivLoadVT.
6042 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6043
6045 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6046 M->getMemoryVT(), M->getMemOperand());
6047
6048 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6049
6050 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6051}
6052
6053SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6054 SelectionDAG &DAG,
6055 ArrayRef<SDValue> Ops) const {
6056 SDLoc DL(M);
6057 EVT LoadVT = M->getValueType(0);
6058 EVT EltType = LoadVT.getScalarType();
6059 EVT IntVT = LoadVT.changeTypeToInteger();
6060
6061 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6062
6063 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6064 bool IsTFE = M->getNumValues() == 3;
6065
6066 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6070
6071 if (IsD16) {
6072 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6073 }
6074
6075 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6076 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6077 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6078 IsTFE);
6079
6080 if (isTypeLegal(LoadVT)) {
6081 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6082 M->getMemOperand(), DAG);
6083 }
6084
6085 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6086 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6087 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6088 M->getMemOperand(), DAG);
6089 return DAG.getMergeValues(
6090 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6091 DL);
6092}
6093
6095 SelectionDAG &DAG) {
6096 EVT VT = N->getValueType(0);
6097 unsigned CondCode = N->getConstantOperandVal(3);
6098 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6099 return DAG.getUNDEF(VT);
6100
6101 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6102
6103 SDValue LHS = N->getOperand(1);
6104 SDValue RHS = N->getOperand(2);
6105
6106 SDLoc DL(N);
6107
6108 EVT CmpVT = LHS.getValueType();
6109 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6110 unsigned PromoteOp =
6112 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6113 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6114 }
6115
6116 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6117
6118 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6119 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6120
6121 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6122 DAG.getCondCode(CCOpcode));
6123 if (VT.bitsEq(CCVT))
6124 return SetCC;
6125 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6126}
6127
6129 SelectionDAG &DAG) {
6130 EVT VT = N->getValueType(0);
6131
6132 unsigned CondCode = N->getConstantOperandVal(3);
6133 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6134 return DAG.getUNDEF(VT);
6135
6136 SDValue Src0 = N->getOperand(1);
6137 SDValue Src1 = N->getOperand(2);
6138 EVT CmpVT = Src0.getValueType();
6139 SDLoc SL(N);
6140
6141 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6142 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6143 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6144 }
6145
6146 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6147 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6148 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6149 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6150 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6151 DAG.getCondCode(CCOpcode));
6152 if (VT.bitsEq(CCVT))
6153 return SetCC;
6154 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6155}
6156
6158 SelectionDAG &DAG) {
6159 EVT VT = N->getValueType(0);
6160 SDValue Src = N->getOperand(1);
6161 SDLoc SL(N);
6162
6163 if (Src.getOpcode() == ISD::SETCC) {
6164 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6165 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6166 Src.getOperand(1), Src.getOperand(2));
6167 }
6168 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6169 // (ballot 0) -> 0
6170 if (Arg->isZero())
6171 return DAG.getConstant(0, SL, VT);
6172
6173 // (ballot 1) -> EXEC/EXEC_LO
6174 if (Arg->isOne()) {
6175 Register Exec;
6176 if (VT.getScalarSizeInBits() == 32)
6177 Exec = AMDGPU::EXEC_LO;
6178 else if (VT.getScalarSizeInBits() == 64)
6179 Exec = AMDGPU::EXEC;
6180 else
6181 return SDValue();
6182
6183 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6184 }
6185 }
6186
6187 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6188 // ISD::SETNE)
6189 return DAG.getNode(
6190 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6191 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6192}
6193
6195 SelectionDAG &DAG) {
6196 EVT VT = N->getValueType(0);
6197 unsigned ValSize = VT.getSizeInBits();
6198 unsigned IID = N->getConstantOperandVal(0);
6199 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6200 IID == Intrinsic::amdgcn_permlanex16;
6201 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6202 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6203 SDLoc SL(N);
6204 MVT IntVT = MVT::getIntegerVT(ValSize);
6205 const GCNSubtarget *ST = TLI.getSubtarget();
6206 unsigned SplitSize = 32;
6207 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6208 ST->hasDPALU_DPP() &&
6209 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6210 SplitSize = 64;
6211
6212 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6213 SDValue Src2, MVT ValT) -> SDValue {
6215 switch (IID) {
6216 case Intrinsic::amdgcn_permlane16:
6217 case Intrinsic::amdgcn_permlanex16:
6218 case Intrinsic::amdgcn_update_dpp:
6219 Operands.push_back(N->getOperand(6));
6220 Operands.push_back(N->getOperand(5));
6221 Operands.push_back(N->getOperand(4));
6222 [[fallthrough]];
6223 case Intrinsic::amdgcn_writelane:
6224 Operands.push_back(Src2);
6225 [[fallthrough]];
6226 case Intrinsic::amdgcn_readlane:
6227 case Intrinsic::amdgcn_set_inactive:
6228 case Intrinsic::amdgcn_set_inactive_chain_arg:
6229 case Intrinsic::amdgcn_mov_dpp8:
6230 Operands.push_back(Src1);
6231 [[fallthrough]];
6232 case Intrinsic::amdgcn_readfirstlane:
6233 case Intrinsic::amdgcn_permlane64:
6234 Operands.push_back(Src0);
6235 break;
6236 default:
6237 llvm_unreachable("unhandled lane op");
6238 }
6239
6240 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6241 std::reverse(Operands.begin(), Operands.end());
6242
6243 if (SDNode *GL = N->getGluedNode()) {
6244 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6245 GL = GL->getOperand(0).getNode();
6246 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6247 SDValue(GL, 0)));
6248 }
6249
6250 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6251 };
6252
6253 SDValue Src0 = N->getOperand(1);
6254 SDValue Src1, Src2;
6255 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6256 IID == Intrinsic::amdgcn_mov_dpp8 ||
6257 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6258 Src1 = N->getOperand(2);
6259 if (IID == Intrinsic::amdgcn_writelane ||
6260 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6261 Src2 = N->getOperand(3);
6262 }
6263
6264 if (ValSize == SplitSize) {
6265 // Already legal
6266 return SDValue();
6267 }
6268
6269 if (ValSize < 32) {
6270 bool IsFloat = VT.isFloatingPoint();
6271 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6272 SL, MVT::i32);
6273
6274 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6275 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6276 SL, MVT::i32);
6277 }
6278
6279 if (IID == Intrinsic::amdgcn_writelane) {
6280 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6281 SL, MVT::i32);
6282 }
6283
6284 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6285 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6286 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6287 }
6288
6289 if (ValSize % SplitSize != 0)
6290 return SDValue();
6291
6292 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6293 EVT VT = N->getValueType(0);
6294 unsigned NE = VT.getVectorNumElements();
6295 EVT EltVT = VT.getVectorElementType();
6297 unsigned NumOperands = N->getNumOperands();
6298 SmallVector<SDValue, 4> Operands(NumOperands);
6299 SDNode *GL = N->getGluedNode();
6300
6301 // only handle convergencectrl_glue
6303
6304 for (unsigned i = 0; i != NE; ++i) {
6305 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6306 ++j) {
6307 SDValue Operand = N->getOperand(j);
6308 EVT OperandVT = Operand.getValueType();
6309 if (OperandVT.isVector()) {
6310 // A vector operand; extract a single element.
6311 EVT OperandEltVT = OperandVT.getVectorElementType();
6312 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6313 Operand, DAG.getVectorIdxConstant(i, SL));
6314 } else {
6315 // A scalar operand; just use it as is.
6316 Operands[j] = Operand;
6317 }
6318 }
6319
6320 if (GL)
6321 Operands[NumOperands - 1] =
6322 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6323 SDValue(GL->getOperand(0).getNode(), 0));
6324
6325 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6326 }
6327
6328 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6329 return DAG.getBuildVector(VecVT, SL, Scalars);
6330 };
6331
6332 if (VT.isVector()) {
6333 switch (MVT::SimpleValueType EltTy =
6335 case MVT::i32:
6336 case MVT::f32:
6337 if (SplitSize == 32) {
6338 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6339 return unrollLaneOp(LaneOp.getNode());
6340 }
6341 [[fallthrough]];
6342 case MVT::i16:
6343 case MVT::f16:
6344 case MVT::bf16: {
6345 unsigned SubVecNumElt =
6346 SplitSize / VT.getVectorElementType().getSizeInBits();
6347 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6349 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6350 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6351 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6352 DAG.getConstant(EltIdx, SL, MVT::i32));
6353
6354 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6355 IsPermLane16)
6356 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6357 DAG.getConstant(EltIdx, SL, MVT::i32));
6358
6359 if (IID == Intrinsic::amdgcn_writelane)
6360 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6361 DAG.getConstant(EltIdx, SL, MVT::i32));
6362
6363 Pieces.push_back(
6364 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6365 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6366 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6367 EltIdx += SubVecNumElt;
6368 }
6369 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6370 }
6371 default:
6372 // Handle all other cases by bitcasting to i32 vectors
6373 break;
6374 }
6375 }
6376
6377 MVT VecVT =
6378 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6379 Src0 = DAG.getBitcast(VecVT, Src0);
6380
6381 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6382 Src1 = DAG.getBitcast(VecVT, Src1);
6383
6384 if (IID == Intrinsic::amdgcn_writelane)
6385 Src2 = DAG.getBitcast(VecVT, Src2);
6386
6387 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6388 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6389 return DAG.getBitcast(VT, UnrolledLaneOp);
6390}
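// When a lane op is wider than the 32-bit (or 64-bit DP ALU) split size, the
// value is bitcast to a vector of split-size pieces, the lane op is applied to
// each piece, and the pieces are bitcast back. A sketch of the common 64-bit
// into two 32-bit case (the helper name and the callable parameter are
// illustrative only):
//
//   #include <cstdint>
//
//   template <typename Fn>   // Fn models a 32-bit lane op such as readlane
//   uint64_t splitLaneOp64(uint64_t Src, Fn LaneOp32) {
//     uint32_t Lo = LaneOp32(uint32_t(Src));
//     uint32_t Hi = LaneOp32(uint32_t(Src >> 32));
//     return (uint64_t(Hi) << 32) | Lo;
//   }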
6391
6394 SelectionDAG &DAG) const {
6395 switch (N->getOpcode()) {
6397 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6398 Results.push_back(Res);
6399 return;
6400 }
6402 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6403 Results.push_back(Res);
6404 return;
6405 }
6407 unsigned IID = N->getConstantOperandVal(0);
6408 switch (IID) {
6409 case Intrinsic::amdgcn_make_buffer_rsrc:
6410 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6411 return;
6412 case Intrinsic::amdgcn_cvt_pkrtz: {
6413 SDValue Src0 = N->getOperand(1);
6414 SDValue Src1 = N->getOperand(2);
6415 SDLoc SL(N);
6416 SDValue Cvt =
6417 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6418 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6419 return;
6420 }
6421 case Intrinsic::amdgcn_cvt_pknorm_i16:
6422 case Intrinsic::amdgcn_cvt_pknorm_u16:
6423 case Intrinsic::amdgcn_cvt_pk_i16:
6424 case Intrinsic::amdgcn_cvt_pk_u16: {
6425 SDValue Src0 = N->getOperand(1);
6426 SDValue Src1 = N->getOperand(2);
6427 SDLoc SL(N);
6428 unsigned Opcode;
6429
6430 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6432 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6434 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6436 else
6438
6439 EVT VT = N->getValueType(0);
6440 if (isTypeLegal(VT))
6441 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6442 else {
6443 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6444 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6445 }
6446 return;
6447 }
6448 case Intrinsic::amdgcn_s_buffer_load: {
6449 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6450 // s_buffer_load_u8 for both the signed and unsigned load instructions. Next,
6451 // the DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6452 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6453 // s_buffer_load_i8.
6454 if (!Subtarget->hasScalarSubwordLoads())
6455 return;
6456 SDValue Op = SDValue(N, 0);
6457 SDValue Rsrc = Op.getOperand(1);
6458 SDValue Offset = Op.getOperand(2);
6459 SDValue CachePolicy = Op.getOperand(3);
6460 EVT VT = Op.getValueType();
6461 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6462 SDLoc DL(Op);
6464 const DataLayout &DataLayout = DAG.getDataLayout();
6465 Align Alignment =
6471 VT.getStoreSize(), Alignment);
6472 SDValue LoadVal;
6473 if (!Offset->isDivergent()) {
6474 SDValue Ops[] = {Rsrc, // source register
6475 Offset, CachePolicy};
6476 SDValue BufferLoad =
6478 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6479 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6480 } else {
6481 SDValue Ops[] = {
6482 DAG.getEntryNode(), // Chain
6483 Rsrc, // rsrc
6484 DAG.getConstant(0, DL, MVT::i32), // vindex
6485 {}, // voffset
6486 {}, // soffset
6487 {}, // offset
6488 CachePolicy, // cachepolicy
6489 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6490 };
6491 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6492 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6493 }
6494 Results.push_back(LoadVal);
6495 return;
6496 }
6497 }
6498 break;
6499 }
6501 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6502 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6503 // FIXME: Hacky
6504 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6505 Results.push_back(Res.getOperand(I));
6506 }
6507 } else {
6508 Results.push_back(Res);
6509 Results.push_back(Res.getValue(1));
6510 }
6511 return;
6512 }
6513
6514 break;
6515 }
6516 case ISD::SELECT: {
6517 SDLoc SL(N);
6518 EVT VT = N->getValueType(0);
6519 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6520 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6521 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6522
6523 EVT SelectVT = NewVT;
6524 if (NewVT.bitsLT(MVT::i32)) {
6525 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6526 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6527 SelectVT = MVT::i32;
6528 }
6529
6530 SDValue NewSelect =
6531 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6532
6533 if (NewVT != SelectVT)
6534 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6535 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6536 return;
6537 }
6538 case ISD::FNEG: {
6539 if (N->getValueType(0) != MVT::v2f16)
6540 break;
6541
6542 SDLoc SL(N);
6543 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6544
6545 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6546 DAG.getConstant(0x80008000, SL, MVT::i32));
6547 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6548 return;
6549 }
6550 case ISD::FABS: {
6551 if (N->getValueType(0) != MVT::v2f16)
6552 break;
6553
6554 SDLoc SL(N);
6555 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6556
6557 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6558 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6559 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6560 return;
6561 }
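// Both v2f16 cases above operate on the 32-bit bitcast of the packed pair:
// FNEG flips the two half-precision sign bits, FABS clears them. A standalone
// sketch on the raw bit patterns (helper names are illustrative only):
//
//   #include <cstdint>
//
//   uint32_t fnegV2F16Bits(uint32_t Packed) { return Packed ^ 0x80008000u; }
//   uint32_t fabsV2F16Bits(uint32_t Packed) { return Packed & 0x7fff7fffu; }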
6562 case ISD::FSQRT: {
6563 if (N->getValueType(0) != MVT::f16)
6564 break;
6565 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6566 break;
6567 }
6568 default:
6570 break;
6571 }
6572}
6573
6574/// Helper function for LowerBRCOND
6575static SDNode *findUser(SDValue Value, unsigned Opcode) {
6576
6577 for (SDUse &U : Value->uses()) {
6578 if (U.get() != Value)
6579 continue;
6580
6581 if (U.getUser()->getOpcode() == Opcode)
6582 return U.getUser();
6583 }
6584 return nullptr;
6585}
6586
6587unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6588 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6589 switch (Intr->getConstantOperandVal(1)) {
6590 case Intrinsic::amdgcn_if:
6591 return AMDGPUISD::IF;
6592 case Intrinsic::amdgcn_else:
6593 return AMDGPUISD::ELSE;
6594 case Intrinsic::amdgcn_loop:
6595 return AMDGPUISD::LOOP;
6596 case Intrinsic::amdgcn_end_cf:
6597 llvm_unreachable("should not occur");
6598 default:
6599 return 0;
6600 }
6601 }
6602
6603 // break, if_break, else_break are all only used as inputs to loop, not
6604 // directly as branch conditions.
6605 return 0;
6606}
6607
6609 const Triple &TT = getTargetMachine().getTargetTriple();
6613}
6614
6616 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6617 return false;
6618
6619 // FIXME: Either avoid relying on address space here or change the default
6620 // address space for functions to avoid the explicit check.
6621 return (GV->getValueType()->isFunctionTy() ||
6624}
6625
6627 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6628}
6629
6631 if (!GV->hasExternalLinkage())
6632 return true;
6633
6634 const auto OS = getTargetMachine().getTargetTriple().getOS();
6635 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6636}
6637
6638 /// This transforms the control flow intrinsics to get the branch destination as
6639 /// the last parameter, and also switches the branch target with BR if the need arises.
6640SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6641 SDLoc DL(BRCOND);
6642
6643 SDNode *Intr = BRCOND.getOperand(1).getNode();
6644 SDValue Target = BRCOND.getOperand(2);
6645 SDNode *BR = nullptr;
6646 SDNode *SetCC = nullptr;
6647
6648 if (Intr->getOpcode() == ISD::SETCC) {
6649 // As long as we negate the condition everything is fine
6650 SetCC = Intr;
6651 Intr = SetCC->getOperand(0).getNode();
6652
6653 } else {
6654 // Get the target from BR if we don't negate the condition
6655 BR = findUser(BRCOND, ISD::BR);
6656 assert(BR && "brcond missing unconditional branch user");
6657 Target = BR->getOperand(1);
6658 }
6659
6660 unsigned CFNode = isCFIntrinsic(Intr);
6661 if (CFNode == 0) {
6662 // This is a uniform branch so we don't need to legalize.
6663 return BRCOND;
6664 }
6665
6666 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6667 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6668
6669 assert(!SetCC ||
6670 (SetCC->getConstantOperandVal(1) == 1 &&
6671 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6672 ISD::SETNE));
6673
6674 // operands of the new intrinsic call
6676 if (HaveChain)
6677 Ops.push_back(BRCOND.getOperand(0));
6678
6679 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6680 Ops.push_back(Target);
6681
6682 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6683
6684 // build the new intrinsic call
6685 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6686
6687 if (!HaveChain) {
6688 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6689
6690 Result = DAG.getMergeValues(Ops, DL).getNode();
6691 }
6692
6693 if (BR) {
6694 // Give the branch instruction our target
6695 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6696 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6697 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6698 }
6699
6700 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6701
6702 // Copy the intrinsic results to registers
6703 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6705 if (!CopyToReg)
6706 continue;
6707
6708 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6709 SDValue(Result, i - 1), SDValue());
6710
6711 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6712 }
6713
6714 // Remove the old intrinsic from the chain
6715 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6716 Intr->getOperand(0));
6717
6718 return Chain;
6719}
6720
6721SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6722 MVT VT = Op.getSimpleValueType();
6723 SDLoc DL(Op);
6724 // Checking the depth
6725 if (Op.getConstantOperandVal(0) != 0)
6726 return DAG.getConstant(0, DL, VT);
6727
6730 // Check for kernel and shader functions
6731 if (Info->isEntryFunction())
6732 return DAG.getConstant(0, DL, VT);
6733
6734 MachineFrameInfo &MFI = MF.getFrameInfo();
6735 // There is a call to @llvm.returnaddress in this function
6736 MFI.setReturnAddressIsTaken(true);
6737
6739 // Get the return address reg and mark it as an implicit live-in
6740 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6741 getRegClassFor(VT, Op.getNode()->isDivergent()));
6742
6743 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6744}
6745
6746SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6747 const SDLoc &DL, EVT VT) const {
6748 return Op.getValueType().bitsLE(VT)
6749 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6750 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6751 DAG.getTargetConstant(0, DL, MVT::i32));
6752}
6753
6754SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6755 assert(Op.getValueType() == MVT::f16 &&
6756 "Do not know how to custom lower FP_ROUND for non-f16 type");
6757
6758 SDValue Src = Op.getOperand(0);
6759 EVT SrcVT = Src.getValueType();
6760 if (SrcVT != MVT::f64)
6761 return Op;
6762
6763 // TODO: Handle strictfp
6764 if (Op.getOpcode() != ISD::FP_ROUND)
6765 return Op;
6766
6767 SDLoc DL(Op);
6768
6769 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6770 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6771 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6772}
6773
6774SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6775 SelectionDAG &DAG) const {
6776 EVT VT = Op.getValueType();
6777 const MachineFunction &MF = DAG.getMachineFunction();
6779 bool IsIEEEMode = Info->getMode().IEEE;
6780
6781 // FIXME: Assert during selection that this is only selected for
6782 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6783 // mode functions, but this happens to be OK since it's only done in cases
6784 // where it is known that there is no sNaN.
6785 if (IsIEEEMode)
6786 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6787
6788 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6789 VT == MVT::v16bf16)
6790 return splitBinaryVectorOp(Op, DAG);
6791 return Op;
6792}
6793
6794SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6795 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6796 EVT VT = Op.getValueType();
6797 assert(VT == MVT::f16);
6798
6799 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6800 EVT ExpVT = Exp.getValueType();
6801 if (ExpVT == MVT::i16)
6802 return Op;
6803
6804 SDLoc DL(Op);
6805
6806 // Correct the exponent type for f16 to i16.
6807 // Clamp the range of the exponent to the instruction's range.
6808
6809 // TODO: This should be a generic narrowing legalization, and can easily be
6810 // done for GlobalISel as well.
6811
6812 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6813 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6814
6815 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6816 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6817
6818 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6819
6820 if (IsStrict) {
6821 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6822 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6823 }
6824
6825 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6826}
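// The exponent handling above clamps the i32 exponent into the signed 16-bit
// range before truncating it for the f16 instruction. A standalone sketch of
// that clamp (the helper name is illustrative only):
//
//   #include <algorithm>
//   #include <cstdint>
//
//   int16_t clampLdexpExponent(int32_t Exp) {
//     int32_t Clamped =
//         std::clamp(Exp, int32_t(INT16_MIN), int32_t(INT16_MAX)); // SMAX/SMIN
//     return int16_t(Clamped);                                     // TRUNCATE
//   }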
6827
6829 switch (Op->getOpcode()) {
6830 case ISD::SRA:
6831 case ISD::SMIN:
6832 case ISD::SMAX:
6833 return ISD::SIGN_EXTEND;
6834 case ISD::SRL:
6835 case ISD::UMIN:
6836 case ISD::UMAX:
6837 return ISD::ZERO_EXTEND;
6838 case ISD::ADD:
6839 case ISD::SUB:
6840 case ISD::AND:
6841 case ISD::OR:
6842 case ISD::XOR:
6843 case ISD::SHL:
6844 case ISD::SELECT:
6845 case ISD::MUL:
6846 // operation result won't be influenced by garbage high bits.
6847 // TODO: are all of those cases correct, and are there more?
6848 return ISD::ANY_EXTEND;
6849 case ISD::SETCC: {
6850 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6852 }
6853 default:
6854 llvm_unreachable("unexpected opcode!");
6855 }
6856}
6857
6858SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6859 DAGCombinerInfo &DCI) const {
6860 const unsigned Opc = Op.getOpcode();
6861 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6862 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6863 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6864 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6865 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6866
6867 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6868 : Op->getOperand(0).getValueType();
6869 auto ExtTy = OpTy.changeElementType(MVT::i32);
6870
6871 if (DCI.isBeforeLegalizeOps() ||
6872 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6873 return SDValue();
6874
6875 auto &DAG = DCI.DAG;
6876
6877 SDLoc DL(Op);
6878 SDValue LHS;
6879 SDValue RHS;
6880 if (Opc == ISD::SELECT) {
6881 LHS = Op->getOperand(1);
6882 RHS = Op->getOperand(2);
6883 } else {
6884 LHS = Op->getOperand(0);
6885 RHS = Op->getOperand(1);
6886 }
6887
6888 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6889 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6890
6891 // Special case: for shifts, the RHS always needs a zext.
6892 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6893 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6894 else
6895 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6896
6897 // setcc always returns an i1 or an i1 vector, so there is no need to truncate after.
6898 if (Opc == ISD::SETCC) {
6899 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6900 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6901 }
6902
6903 // For other ops, we extend the operation's return type as well so we need to
6904 // truncate back to the original type.
6905 SDValue NewVal;
6906 if (Opc == ISD::SELECT)
6907 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6908 else
6909 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6910
6911 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6912}
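// The promotion above performs a 16-bit uniform op in 32 bits: operands are
// sign-, zero-, or any-extended depending on the opcode, shift amounts are
// always zero-extended, and the result is truncated back. A standalone sketch
// for an arithmetic shift right (names are illustrative; it assumes Amt < 16):
//
//   #include <cstdint>
//
//   uint16_t sra16Via32(int16_t Val, uint16_t Amt) {
//     int32_t ExtVal = int32_t(Val);    // SIGN_EXTEND for ISD::SRA
//     uint32_t ExtAmt = uint32_t(Amt);  // shift amount is zero-extended
//     int32_t Res = ExtVal >> ExtAmt;
//     return uint16_t(Res);             // truncate back to the original type
//   }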
6913
6914// Custom lowering for vector multiplications and s_mul_u64.
6915SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6916 EVT VT = Op.getValueType();
6917
6918 // Split vector operands.
6919 if (VT.isVector())
6920 return splitBinaryVectorOp(Op, DAG);
6921
6922 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6923
6924 // There are four ways to lower s_mul_u64:
6925 //
6926 // 1. If all the operands are uniform, then we lower it as it is.
6927 //
6928 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6929 // multiplications because there is no vector equivalent of s_mul_u64.
6930 //
6931 // 3. If the cost model decides that it is more efficient to use vector
6932 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6933 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6934 //
6935 // 4. If the cost model decides to use vector registers and both of the
6936 // operands are zero-extended/sign-extended from 32-bits, then we split the
6937 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6938 // possible to check if the operands are zero-extended or sign-extended in
6939 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6940 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6941 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6942 // If the cost model decides that we have to use vector registers, then
6943 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6944 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6945 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6946 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6947 // SIInstrInfo.cpp .
6948
6949 if (Op->isDivergent())
6950 return SDValue();
6951
6952 SDValue Op0 = Op.getOperand(0);
6953 SDValue Op1 = Op.getOperand(1);
6954 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6955 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6956 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6957 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6958 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6959 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6960 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6961 SDLoc SL(Op);
6962 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6963 return SDValue(
6964 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6965 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6966 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6967 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6968 return SDValue(
6969 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6970 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6971 return Op;
6972}
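// The known-bits checks above justify the pseudo selection: when both 64-bit
// operands are really zero-extended 32-bit values, the full product equals a
// single 32x32->64 multiply (and likewise for the sign-extended case). A
// standalone sketch of the zero-extended case (the helper name is illustrative;
// it is only valid when both operands have at least 32 leading zero bits):
//
//   #include <cstdint>
//
//   uint64_t mulU64AsU32(uint64_t A, uint64_t B) {
//     return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));  // 32x32 -> 64
//   }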
6973
6974SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6975 EVT VT = Op.getValueType();
6976 SDLoc SL(Op);
6977 SDValue LHS = Op.getOperand(0);
6978 SDValue RHS = Op.getOperand(1);
6979 bool isSigned = Op.getOpcode() == ISD::SMULO;
6980
6981 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6982 const APInt &C = RHSC->getAPIntValue();
6983 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6984 if (C.isPowerOf2()) {
6985 // smulo(x, signed_min) is same as umulo(x, signed_min).
6986 bool UseArithShift = isSigned && !C.isMinSignedValue();
6987 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6988 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6989 SDValue Overflow =
6990 DAG.getSetCC(SL, MVT::i1,
6991 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
6992 Result, ShiftAmt),
6993 LHS, ISD::SETNE);
6994 return DAG.getMergeValues({Result, Overflow}, SL);
6995 }
6996 }
6997
6998 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6999 SDValue Top =
7000 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7001
7002 SDValue Sign = isSigned
7003 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7004 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7005 SL, MVT::i32))
7006 : DAG.getConstant(0, SL, VT);
7007 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7008
7009 return DAG.getMergeValues({Result, Overflow}, SL);
7010}
7011
7012SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7013 if (Op->isDivergent()) {
7014 // Select to V_MAD_[IU]64_[IU]32.
7015 return Op;
7016 }
7017 if (Subtarget->hasSMulHi()) {
7018 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7019 return SDValue();
7020 }
7021 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7022 // calculate the high part, so we might as well do the whole thing with
7023 // V_MAD_[IU]64_[IU]32.
7024 return Op;
7025}
7026
7027SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7028 if (!Subtarget->isTrapHandlerEnabled() ||
7030 return lowerTrapEndpgm(Op, DAG);
7031
7032 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7033 : lowerTrapHsaQueuePtr(Op, DAG);
7034}
7035
7036SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7037 SDLoc SL(Op);
7038 SDValue Chain = Op.getOperand(0);
7039 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7040}
7041
7042SDValue
7043SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7044 const SDLoc &DL, Align Alignment,
7045 ImplicitParameter Param) const {
7048 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7050 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7053}
7054
7055SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7056 SelectionDAG &DAG) const {
7057 SDLoc SL(Op);
7058 SDValue Chain = Op.getOperand(0);
7059
7060 SDValue QueuePtr;
7061 // For code object version 5, QueuePtr is passed through implicit kernarg.
7062 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7064 QueuePtr =
7065 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7066 } else {
7069 Register UserSGPR = Info->getQueuePtrUserSGPR();
7070
7071 if (UserSGPR == AMDGPU::NoRegister) {
7072 // We probably are in a function incorrectly marked with
7073 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7074 // trap, so just use a null pointer.
7075 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7076 } else {
7077 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7078 MVT::i64);
7079 }
7080 }
7081
7082 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7083 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7084
7086 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7087 ToReg.getValue(1)};
7088 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7089}
7090
7091SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7092 SDLoc SL(Op);
7093 SDValue Chain = Op.getOperand(0);
7094
7095 // We need to simulate the 's_trap 2' instruction on targets that run in
7096 // PRIV=1 (where it is treated as a nop).
7097 if (Subtarget->hasPrivEnabledTrap2NopBug())
7098 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7099
7101 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7102 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7103}
7104
7105SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7106 SDLoc SL(Op);
7107 SDValue Chain = Op.getOperand(0);
7109
7110 if (!Subtarget->isTrapHandlerEnabled() ||
7113 "debugtrap handler not supported",
7114 Op.getDebugLoc(), DS_Warning);
7115 LLVMContext &Ctx = MF.getFunction().getContext();
7116 Ctx.diagnose(NoTrap);
7117 return Chain;
7118 }
7119
7120 uint64_t TrapID =
7122 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7123 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7124}
7125
7126SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7127 SelectionDAG &DAG) const {
7128 if (Subtarget->hasApertureRegs()) {
7129 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7130 ? AMDGPU::SRC_SHARED_BASE
7131 : AMDGPU::SRC_PRIVATE_BASE;
7132 // Note: this feature (register) is broken. When used as a 32-bit operand,
7133 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7134 // bits.
7135 //
7136 // To work around the issue, directly emit a 64 bit mov from this register
7137 // then extract the high bits. Note that this shouldn't even result in a
7138 // shift being emitted and simply become a pair of registers (e.g.):
7139 // s_mov_b64 s[6:7], src_shared_base
7140 // v_mov_b32_e32 v1, s7
7141 //
7142 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7143 // coalescing would kick in and it would think it's okay to use the "HI"
7144 // subregister directly (instead of extracting the HI 32 bits) which is an
7145 // artificial (unusable) register.
7146 // Register TableGen definitions would need an overhaul to get rid of the
7147 // artificial "HI" aperture registers and prevent this kind of issue from
7148 // happening.
7149 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7150 DAG.getRegister(ApertureRegNo, MVT::i64));
7151 return DAG.getNode(
7152 ISD::TRUNCATE, DL, MVT::i32,
7153 DAG.getNode(ISD::SRL, DL, MVT::i64,
7154 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7155 }
7156
7157 // For code object version 5, private_base and shared_base are passed through
7158 // implicit kernargs.
7159 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7163 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7164 }
7165
7168 Register UserSGPR = Info->getQueuePtrUserSGPR();
7169 if (UserSGPR == AMDGPU::NoRegister) {
7170 // We probably are in a function incorrectly marked with
7171 // amdgpu-no-queue-ptr. This is undefined.
7172 return DAG.getUNDEF(MVT::i32);
7173 }
7174
7175 SDValue QueuePtr =
7176 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7177
7178 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7179 // private_segment_aperture_base_hi.
7180 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7181
7182 SDValue Ptr =
7183 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7184
7185 // TODO: Use custom target PseudoSourceValue.
7186 // TODO: We should use the value from the IR intrinsic call, but it might not
7187 // be available and how do we get it?
7189 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7190 commonAlignment(Align(64), StructOffset),
7193}
7194
7195/// Return true if the value is a known valid address, such that a null check is
7196/// not necessary.
7198 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7199 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7200 isa<BasicBlockSDNode>(Val))
7201 return true;
7202
7203 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7204 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7205
7206 // TODO: Search through arithmetic, handle arguments and loads
7207 // marked nonnull.
7208 return false;
7209}
7210
7211SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7212 SelectionDAG &DAG) const {
7213 SDLoc SL(Op);
7214
7215 const AMDGPUTargetMachine &TM =
7216 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7217
7218 unsigned DestAS, SrcAS;
7219 SDValue Src;
7220 bool IsNonNull = false;
7221 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7222 SrcAS = ASC->getSrcAddressSpace();
7223 Src = ASC->getOperand(0);
7224 DestAS = ASC->getDestAddressSpace();
7225 } else {
7226 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7227 Op.getConstantOperandVal(0) ==
7228 Intrinsic::amdgcn_addrspacecast_nonnull);
7229 Src = Op->getOperand(1);
7230 SrcAS = Op->getConstantOperandVal(2);
7231 DestAS = Op->getConstantOperandVal(3);
7232 IsNonNull = true;
7233 }
7234
7235 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7236
7237 // flat -> local/private
7238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7239 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7240 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7241 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7242
7243 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7244 return Ptr;
7245
7246 unsigned NullVal = TM.getNullPointerValue(DestAS);
7247 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7248 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7249
7250 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7251 SegmentNullPtr);
7252 }
7253 }
7254
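  // A local/private pointer is widened to a flat pointer by packing the 32-bit
  // segment offset into the low dword and the segment's aperture (the high 32
  // bits of the segment's base address in the flat address space) into the
  // high dword; the segment null value is mapped to the flat null pointer via
  // the select below.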
7255 // local/private -> flat
7256 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7257 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7258 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7259
7260 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7261 SDValue CvtPtr =
7262 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7263 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7264
7265 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7266 return CvtPtr;
7267
7268 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7269 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7270
7271 SDValue NonNull =
7272 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7273
7274 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7275 FlatNullPtr);
7276 }
7277 }
7278
7279 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7280 Op.getValueType() == MVT::i64) {
7283 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7284 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7285 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7286 }
7287
7288 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7289 Src.getValueType() == MVT::i64)
7290 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7291
7292 // global <-> flat are no-ops and never emitted.
7293
7294 const MachineFunction &MF = DAG.getMachineFunction();
7295 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7296 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7297 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7298
7299 return DAG.getUNDEF(Op->getValueType(0));
7300}
7301
7302// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7303// the small vector and inserting them into the big vector. That is better than
7304// the default expansion of doing it via a stack slot. Even though the use of
7305// the stack slot would be optimized away afterwards, the stack slot itself
7306// remains.
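// Illustrative example: inserting a v2i16 subvector into a v4i16 at index 2
// takes the 16-bit path below; both vectors are bitcast to 32-bit lanes
// (v4i16 -> v2i32, v2i16 -> i32) and the insert becomes a single 32-bit
// INSERT_VECTOR_ELT at index 1.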
7307SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7308 SelectionDAG &DAG) const {
7309 SDValue Vec = Op.getOperand(0);
7310 SDValue Ins = Op.getOperand(1);
7311 SDValue Idx = Op.getOperand(2);
7312 EVT VecVT = Vec.getValueType();
7313 EVT InsVT = Ins.getValueType();
7314 EVT EltVT = VecVT.getVectorElementType();
7315 unsigned InsNumElts = InsVT.getVectorNumElements();
7316 unsigned IdxVal = Idx->getAsZExtVal();
7317 SDLoc SL(Op);
7318
7319 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7320 // Insert 32-bit registers at a time.
7321 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7322
7323 unsigned VecNumElts = VecVT.getVectorNumElements();
7324 EVT NewVecVT =
7325 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7326 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7328 MVT::i32, InsNumElts / 2);
7329
7330 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7331 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7332
7333 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7334 SDValue Elt;
7335 if (InsNumElts == 2) {
7336 Elt = Ins;
7337 } else {
7338 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7339 DAG.getConstant(I, SL, MVT::i32));
7340 }
7341 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7342 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7343 }
7344
7345 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7346 }
7347
7348 for (unsigned I = 0; I != InsNumElts; ++I) {
7349 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7350 DAG.getConstant(I, SL, MVT::i32));
7351 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7352 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7353 }
7354 return Vec;
7355}
7356
7357SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7358 SelectionDAG &DAG) const {
7359 SDValue Vec = Op.getOperand(0);
7360 SDValue InsVal = Op.getOperand(1);
7361 SDValue Idx = Op.getOperand(2);
7362 EVT VecVT = Vec.getValueType();
7363 EVT EltVT = VecVT.getVectorElementType();
7364 unsigned VecSize = VecVT.getSizeInBits();
7365 unsigned EltSize = EltVT.getSizeInBits();
7366 SDLoc SL(Op);
7367
7368 // Specially handle the case of v4i16 with static indexing.
7369 unsigned NumElts = VecVT.getVectorNumElements();
7370 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7371 if (NumElts == 4 && EltSize == 16 && KIdx) {
7372 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7373
7374 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7375 DAG.getConstant(0, SL, MVT::i32));
7376 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7377 DAG.getConstant(1, SL, MVT::i32));
7378
7379 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7380 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7381
7382 unsigned Idx = KIdx->getZExtValue();
7383 bool InsertLo = Idx < 2;
7384 SDValue InsHalf = DAG.getNode(
7385 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7386 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7387 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7388
7389 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7390
7391 SDValue Concat =
7392 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7393 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7394
7395 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7396 }
7397
7398 // Static indexing does not lower to stack access, and hence there is no need
7399 // for special custom lowering to avoid stack access.
7400 if (isa<ConstantSDNode>(Idx))
7401 return SDValue();
7402
7403 // Avoid stack access for dynamic indexing by custom lowering to
7404 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
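  //
  // Illustrative example: inserting into a v4i16 at dynamic index 2
  // (EltSize = 16, VecSize = 64) gives ScaledIdx = 32 and BFM = 0xFFFF << 32,
  // so the splatted value contributes bits [47:32], the original vector
  // contributes the remaining bits, and the final OR merges them:
  //   result = (splat(val) & BFM) | (vec & ~BFM)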
7405
7406 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7407
7408 MVT IntVT = MVT::getIntegerVT(VecSize);
7409
7410 // Convert vector index to bit-index and get the required bit mask.
7411 assert(isPowerOf2_32(EltSize));
7412 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7413 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7414 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7415 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7416 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7417
7418 // 1. Create a congruent vector with the target value in each element.
7419 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7420 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7421
7422 // 2. Mask off all other indices except the required index within (1).
7423 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7424
7425 // 3. Mask off the required index within the target vector.
7426 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7427 SDValue RHS =
7428 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7429
7430 // 4. Get (2) and (3) ORed into the target vector.
7431 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7432
7433 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7434}
7435
7436SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7437 SelectionDAG &DAG) const {
7438 SDLoc SL(Op);
7439
7440 EVT ResultVT = Op.getValueType();
7441 SDValue Vec = Op.getOperand(0);
7442 SDValue Idx = Op.getOperand(1);
7443 EVT VecVT = Vec.getValueType();
7444 unsigned VecSize = VecVT.getSizeInBits();
7445 EVT EltVT = VecVT.getVectorElementType();
7446
7447 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7448
7449 // Make sure we do any optimizations that will make it easier to fold
7450 // source modifiers before obscuring it with bit operations.
7451
7452 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7453 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7454 return Combined;
7455
7456 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7457 SDValue Lo, Hi;
7458 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7459
7460 if (VecSize == 128) {
7461 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7462 Lo = DAG.getBitcast(LoVT,
7463 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7464 DAG.getConstant(0, SL, MVT::i32)));
7465 Hi = DAG.getBitcast(HiVT,
7466 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7467 DAG.getConstant(1, SL, MVT::i32)));
7468 } else if (VecSize == 256) {
7469 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7470 SDValue Parts[4];
7471 for (unsigned P = 0; P < 4; ++P) {
7472 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7473 DAG.getConstant(P, SL, MVT::i32));
7474 }
7475
7476 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7477 Parts[0], Parts[1]));
7478 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7479 Parts[2], Parts[3]));
7480 } else {
7481 assert(VecSize == 512);
7482
7483 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7484 SDValue Parts[8];
7485 for (unsigned P = 0; P < 8; ++P) {
7486 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7487 DAG.getConstant(P, SL, MVT::i32));
7488 }
7489
7490 Lo = DAG.getBitcast(LoVT,
7491 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7492 Parts[0], Parts[1], Parts[2], Parts[3]));
7493 Hi = DAG.getBitcast(HiVT,
7494 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7495 Parts[4], Parts[5], Parts[6], Parts[7]));
7496 }
7497
7498 EVT IdxVT = Idx.getValueType();
7499 unsigned NElem = VecVT.getVectorNumElements();
7500 assert(isPowerOf2_32(NElem));
7501 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7502 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7503 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7504 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7505 }
7506
7507 assert(VecSize <= 64);
7508
7509 MVT IntVT = MVT::getIntegerVT(VecSize);
7510
7511 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7512 SDValue VecBC = peekThroughBitcasts(Vec);
7513 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7514 SDValue Src = VecBC.getOperand(0);
7515 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7516 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7517 }
7518
7519 unsigned EltSize = EltVT.getSizeInBits();
7520 assert(isPowerOf2_32(EltSize));
7521
7522 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7523
7524 // Convert vector index to bit-index (* EltSize)
7525 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7526
7527 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7528 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
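  // Elt now holds the selected element in its low bits. Illustrative example:
  // extracting element 3 of a v4i16 (EltSize = 16) shifts the 64-bit bitcast
  // vector right by 48, leaving the element in bits [15:0], which is then
  // truncated or extended to the result type below.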
7529
7530 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7531 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7532 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7533 }
7534
7535 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7536}
7537
7538static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7539 assert(Elt % 2 == 0);
7540 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7541}
7542
7543SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7544 SelectionDAG &DAG) const {
7545 SDLoc SL(Op);
7546 EVT ResultVT = Op.getValueType();
7547 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7548
7549 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7550 EVT EltVT = PackVT.getVectorElementType();
7551 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7552
7553 // vector_shuffle <0,1,6,7> lhs, rhs
7554 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7555 //
7556 // vector_shuffle <6,7,2,3> lhs, rhs
7557 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7558 //
7559 // vector_shuffle <6,7,0,1> lhs, rhs
7560 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7561
7562 // Avoid scalarizing when both halves are reading from consecutive elements.
7564 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7565 if (elementPairIsContiguous(SVN->getMask(), I)) {
7566 const int Idx = SVN->getMaskElt(I);
7567 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7568 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7569 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7570 SVN->getOperand(VecIdx),
7571 DAG.getConstant(EltIdx, SL, MVT::i32));
7572 Pieces.push_back(SubVec);
7573 } else {
7574 const int Idx0 = SVN->getMaskElt(I);
7575 const int Idx1 = SVN->getMaskElt(I + 1);
7576 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7577 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7578 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7579 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7580
7581 SDValue Vec0 = SVN->getOperand(VecIdx0);
7582 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7583 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7584
7585 SDValue Vec1 = SVN->getOperand(VecIdx1);
7586 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7587 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7588 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7589 }
7590 }
7591
7592 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7593}
7594
7595SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7596 SelectionDAG &DAG) const {
7597 SDValue SVal = Op.getOperand(0);
7598 EVT ResultVT = Op.getValueType();
7599 EVT SValVT = SVal.getValueType();
7600 SDValue UndefVal = DAG.getUNDEF(SValVT);
7601 SDLoc SL(Op);
7602
7604 VElts.push_back(SVal);
7605 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7606 VElts.push_back(UndefVal);
7607
7608 return DAG.getBuildVector(ResultVT, SL, VElts);
7609}
7610
7611SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7612 SelectionDAG &DAG) const {
7613 SDLoc SL(Op);
7614 EVT VT = Op.getValueType();
7615
7616 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7617 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7618
7619 SDValue Lo = Op.getOperand(0);
7620 SDValue Hi = Op.getOperand(1);
7621
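    // The general case below packs the two 16-bit halves into one 32-bit
    // value, zext(Lo) | (zext(Hi) << 16), handling the undef cases first to
    // avoid materializing unnecessary bits.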
7622 // Avoid adding defined bits with the zero_extend.
7623 if (Hi.isUndef()) {
7624 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7625 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7626 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7627 }
7628
7629 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7630 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7631
7632 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7633 DAG.getConstant(16, SL, MVT::i32));
7634 if (Lo.isUndef())
7635 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7636
7637 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7638 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7639
7640 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7641 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7642 }
7643
7644 // Split into 2-element chunks.
7645 const unsigned NumParts = VT.getVectorNumElements() / 2;
7647 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7648
7650 for (unsigned P = 0; P < NumParts; ++P) {
7651 SDValue Vec = DAG.getBuildVector(
7652 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7653 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7654 }
7655
7656 SDValue Blend =
7657 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7658 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7659}
7660
7662 const GlobalAddressSDNode *GA) const {
7663 // OSes that use ELF REL relocations (instead of RELA) can only store a
7664 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7665 // which can create arbitrary 64-bit addends. (This is only a problem for
7666 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7667 // the high 32 bits of the addend.)
7668 //
7669 // This should be kept in sync with how HasRelocationAddend is initialized in
7670 // the constructor of ELFAMDGPUAsmBackend.
7671 if (!Subtarget->isAmdHsaOS())
7672 return false;
7673
7674 // We can fold offsets for anything that doesn't require a GOT relocation.
7675 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7679}
7680
7681static SDValue
7683 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7684 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7685 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7686 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7687 // lowered to the following code sequence:
7688 //
7689 // For constant address space:
7690 // s_getpc_b64 s[0:1]
7691 // s_add_u32 s0, s0, $symbol
7692 // s_addc_u32 s1, s1, 0
7693 //
7694 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7695 // a fixup or relocation is emitted to replace $symbol with a literal
7696 // constant, which is a pc-relative offset from the encoding of the $symbol
7697 // operand to the global variable.
7698 //
7699 // For global address space:
7700 // s_getpc_b64 s[0:1]
7701 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7702 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7703 //
7704 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7705 // fixups or relocations are emitted to replace $symbol@*@lo and
7706 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7707 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7708 // operand to the global variable.
7709 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7710 SDValue PtrHi;
7711 if (GAFlags == SIInstrInfo::MO_NONE)
7712 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7713 else
7714 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7715 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7716}
7717
7718SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7719 SDValue Op,
7720 SelectionDAG &DAG) const {
7721 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7722 SDLoc DL(GSD);
7723 EVT PtrVT = Op.getValueType();
7724
7725 const GlobalValue *GV = GSD->getGlobal();
7731 GV->hasExternalLinkage()) {
7732 Type *Ty = GV->getValueType();
7733 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
7734 // zero-sized type in other languages) to declare dynamic shared
7735 // memory whose size is not known at compile time. Such arrays are
7736 // allocated by the runtime and placed directly after the statically
7737 // allocated ones, so they all share the same offset.
7738 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7739 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7740 // Adjust alignment for that dynamic shared memory array.
7742 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7743 MFI->setUsesDynamicLDS(true);
7744 return SDValue(
7745 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7746 }
7747 }
7749 }
7750
7752 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7754 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7755 }
7756
7757 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7758 SDValue AddrLo = DAG.getTargetGlobalAddress(
7759 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7760 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7761
7762 SDValue AddrHi = DAG.getTargetGlobalAddress(
7763 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7764 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7765
7766 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7767 }
7768
7769 if (shouldEmitFixup(GV))
7770 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7771
7772 if (shouldEmitPCReloc(GV))
7773 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7775
7776 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7778
7779 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7781 const DataLayout &DataLayout = DAG.getDataLayout();
7782 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7783 MachinePointerInfo PtrInfo =
7785
7786 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7789}
7790
7792 const SDLoc &DL, SDValue V) const {
7793 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7794 // the destination register.
7795 //
7796 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7797 // so we will end up with redundant moves to m0.
7798 //
7799 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7800
7801 // A Null SDValue creates a glue result.
7802 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7803 V, Chain);
7804 return SDValue(M0, 0);
7805}
7806
7807SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7808 MVT VT,
7809 unsigned Offset) const {
7810 SDLoc SL(Op);
7811 SDValue Param = lowerKernargMemParameter(
7812 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7813 // The local size values will have the hi 16-bits as zero.
7814 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7815 DAG.getValueType(VT));
7816}
7817
7819 EVT VT) {
7821 "non-hsa intrinsic with hsa target",
7822 DL.getDebugLoc());
7823 DAG.getContext()->diagnose(BadIntrin);
7824 return DAG.getUNDEF(VT);
7825}
7826
7828 EVT VT) {
7830 "intrinsic not supported on subtarget",
7831 DL.getDebugLoc());
7832 DAG.getContext()->diagnose(BadIntrin);
7833 return DAG.getUNDEF(VT);
7834}
7835
7837 ArrayRef<SDValue> Elts) {
7838 assert(!Elts.empty());
7839 MVT Type;
7840 unsigned NumElts = Elts.size();
7841
7842 if (NumElts <= 12) {
7843 Type = MVT::getVectorVT(MVT::f32, NumElts);
7844 } else {
7845 assert(Elts.size() <= 16);
7846 Type = MVT::v16f32;
7847 NumElts = 16;
7848 }
7849
7850 SmallVector<SDValue, 16> VecElts(NumElts);
7851 for (unsigned i = 0; i < Elts.size(); ++i) {
7852 SDValue Elt = Elts[i];
7853 if (Elt.getValueType() != MVT::f32)
7854 Elt = DAG.getBitcast(MVT::f32, Elt);
7855 VecElts[i] = Elt;
7856 }
7857 for (unsigned i = Elts.size(); i < NumElts; ++i)
7858 VecElts[i] = DAG.getUNDEF(MVT::f32);
7859
7860 if (NumElts == 1)
7861 return VecElts[0];
7862 return DAG.getBuildVector(Type, DL, VecElts);
7863}
7864
7865static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7866 SDValue Src, int ExtraElts) {
7867 EVT SrcVT = Src.getValueType();
7868
7870
7871 if (SrcVT.isVector())
7872 DAG.ExtractVectorElements(Src, Elts);
7873 else
7874 Elts.push_back(Src);
7875
7876 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7877 while (ExtraElts--)
7878 Elts.push_back(Undef);
7879
7880 return DAG.getBuildVector(CastVT, DL, Elts);
7881}
7882
7883 // Re-construct the required return value for an image load intrinsic.
7884 // This is more complicated due to the optional use of TexFailCtrl, which means
7885 // the required return type is an aggregate.
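// Illustrative example: a packed-D16 load returning 3 half elements needs
// (3 + 1) / 2 = 2 data dwords, and if TexFailCtrl requested TFE the raw
// result carries one extra status dword, extracted at index MaskPopDwords
// below.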
7887 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7888 bool Unpacked, bool IsD16, int DMaskPop,
7889 int NumVDataDwords, bool IsAtomicPacked16Bit,
7890 const SDLoc &DL) {
7891 // Determine the required return type. This is the same regardless of
7892 // IsTexFail flag
7893 EVT ReqRetVT = ResultTypes[0];
7894 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7895 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7896 ? (ReqRetNumElts + 1) / 2
7897 : ReqRetNumElts;
7898
7899 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7900
7901 MVT DataDwordVT =
7902 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7903
7904 MVT MaskPopVT =
7905 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7906
7907 SDValue Data(Result, 0);
7908 SDValue TexFail;
7909
7910 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7911 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7912 if (MaskPopVT.isVector()) {
7913 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7914 SDValue(Result, 0), ZeroIdx);
7915 } else {
7916 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7917 SDValue(Result, 0), ZeroIdx);
7918 }
7919 }
7920
7921 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7922 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7923 NumDataDwords - MaskPopDwords);
7924
7925 if (IsD16)
7926 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7927
7928 EVT LegalReqRetVT = ReqRetVT;
7929 if (!ReqRetVT.isVector()) {
7930 if (!Data.getValueType().isInteger())
7931 Data = DAG.getNode(ISD::BITCAST, DL,
7932 Data.getValueType().changeTypeToInteger(), Data);
7933 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7934 } else {
7935 // We need to widen the return vector to a legal type
7936 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7937 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7938 LegalReqRetVT =
7940 ReqRetVT.getVectorNumElements() + 1);
7941 }
7942 }
7943 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7944
7945 if (IsTexFail) {
7946 TexFail =
7947 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7948 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7949
7950 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7951 }
7952
7953 if (Result->getNumValues() == 1)
7954 return Data;
7955
7956 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7957}
7958
7959static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7960 SDValue *LWE, bool &IsTexFail) {
7961 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7962
7963 uint64_t Value = TexFailCtrlConst->getZExtValue();
7964 if (Value) {
7965 IsTexFail = true;
7966 }
7967
7968 SDLoc DL(TexFailCtrlConst);
7969 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7970 Value &= ~(uint64_t)0x1;
7971 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7972 Value &= ~(uint64_t)0x2;
7973
7974 return Value == 0;
7975}
7976
7978 MVT PackVectorVT,
7979 SmallVectorImpl<SDValue> &PackedAddrs,
7980 unsigned DimIdx, unsigned EndIdx,
7981 unsigned NumGradients) {
7982 SDLoc DL(Op);
7983 for (unsigned I = DimIdx; I < EndIdx; I++) {
7984 SDValue Addr = Op.getOperand(I);
7985
7986 // Gradients are packed with undef for each coordinate.
7987 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7988 // 1D: undef,dx/dh; undef,dx/dv
7989 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7990 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7991 if (((I + 1) >= EndIdx) ||
7992 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7993 I == DimIdx + NumGradients - 1))) {
7994 if (Addr.getValueType() != MVT::i16)
7995 Addr = DAG.getBitcast(MVT::i16, Addr);
7996 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7997 } else {
7998 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7999 I++;
8000 }
8001 Addr = DAG.getBitcast(MVT::f32, Addr);
8002 PackedAddrs.push_back(Addr);
8003 }
8004}
8005
8006SDValue SITargetLowering::lowerImage(SDValue Op,
8008 SelectionDAG &DAG, bool WithChain) const {
8009 SDLoc DL(Op);
8011 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8012 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8014 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8015 unsigned IntrOpcode = Intr->BaseOpcode;
8016 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8017 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8018 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8019
8020 SmallVector<EVT, 3> ResultTypes(Op->values());
8021 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8022 bool IsD16 = false;
8023 bool IsG16 = false;
8024 bool IsA16 = false;
8025 SDValue VData;
8026 int NumVDataDwords = 0;
8027 bool AdjustRetType = false;
8028 bool IsAtomicPacked16Bit = false;
8029
8030 // Offset of intrinsic arguments
8031 const unsigned ArgOffset = WithChain ? 2 : 1;
8032
8033 unsigned DMask;
8034 unsigned DMaskLanes = 0;
8035
8036 if (BaseOpcode->Atomic) {
8037 VData = Op.getOperand(2);
8038
8039 IsAtomicPacked16Bit =
8040 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8041 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8042
8043 bool Is64Bit = VData.getValueSizeInBits() == 64;
8044 if (BaseOpcode->AtomicX2) {
8045 SDValue VData2 = Op.getOperand(3);
8046 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8047 {VData, VData2});
8048 if (Is64Bit)
8049 VData = DAG.getBitcast(MVT::v4i32, VData);
8050
8051 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8052 DMask = Is64Bit ? 0xf : 0x3;
8053 NumVDataDwords = Is64Bit ? 4 : 2;
8054 } else {
8055 DMask = Is64Bit ? 0x3 : 0x1;
8056 NumVDataDwords = Is64Bit ? 2 : 1;
8057 }
8058 } else {
8059 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8060 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8061
8062 if (BaseOpcode->Store) {
8063 VData = Op.getOperand(2);
8064
8065 MVT StoreVT = VData.getSimpleValueType();
8066 if (StoreVT.getScalarType() == MVT::f16) {
8067 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8068 return Op; // D16 is unsupported for this instruction
8069
8070 IsD16 = true;
8071 VData = handleD16VData(VData, DAG, true);
8072 }
8073
8074 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8075 } else if (!BaseOpcode->NoReturn) {
8076 // Work out the number of dwords based on the dmask popcount, the
8077 // underlying type, and whether packing is supported.
8078 MVT LoadVT = ResultTypes[0].getSimpleVT();
8079 if (LoadVT.getScalarType() == MVT::f16) {
8080 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8081 return Op; // D16 is unsupported for this instruction
8082
8083 IsD16 = true;
8084 }
8085
8086 // Confirm that the return type is large enough for the dmask specified
8087 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8088 (!LoadVT.isVector() && DMaskLanes > 1))
8089 return Op;
8090
8091 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8092 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8093 // instructions.
8094 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8095 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8096 NumVDataDwords = (DMaskLanes + 1) / 2;
8097 else
8098 NumVDataDwords = DMaskLanes;
8099
8100 AdjustRetType = true;
8101 }
8102 }
8103
8104 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8106
8107 // Check for 16 bit addresses or derivatives and pack if true.
8108 MVT VAddrVT =
8109 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8110 MVT VAddrScalarVT = VAddrVT.getScalarType();
8111 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8112 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8113
8114 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8115 VAddrScalarVT = VAddrVT.getScalarType();
8116 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8117 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8118
8119 // Push back extra arguments.
8120 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8121 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8122 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8123 // Special handling of bias when A16 is on. Bias is of type half but
8124 // occupies a full 32 bits.
8125 SDValue Bias = DAG.getBuildVector(
8126 MVT::v2f16, DL,
8127 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8128 VAddrs.push_back(Bias);
8129 } else {
8130 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8131 "Bias needs to be converted to 16 bit in A16 mode");
8132 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8133 }
8134 }
8135
8136 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8137 // 16 bit gradients are supported, but are tied to the A16 control,
8138 // so both gradients and addresses must be 16 bit.
8139 LLVM_DEBUG(
8140 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8141 "require 16 bit args for both gradients and addresses");
8142 return Op;
8143 }
8144
8145 if (IsA16) {
8146 if (!ST->hasA16()) {
8147 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8148 "support 16 bit addresses\n");
8149 return Op;
8150 }
8151 }
8152
8153 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8154 // is set then we have to compress/pack operands (either addresses,
8155 // gradients, or both).
8156 // In the case where A16 and gradients are tied (no G16 support), we
8157 // have already verified that both IsA16 and IsG16 are true.
8158 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8159 // Activate g16
8160 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8162 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8163 }
8164
8165 // Add gradients (packed or unpacked)
8166 if (IsG16) {
8167 // Pack the gradients
8168 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8169 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8170 ArgOffset + Intr->GradientStart,
8171 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8172 } else {
8173 for (unsigned I = ArgOffset + Intr->GradientStart;
8174 I < ArgOffset + Intr->CoordStart; I++)
8175 VAddrs.push_back(Op.getOperand(I));
8176 }
8177
8178 // Add addresses (packed or unpacked)
8179 if (IsA16) {
8180 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8181 ArgOffset + Intr->CoordStart, VAddrEnd,
8182 0 /* No gradients */);
8183 } else {
8184 // Add uncompressed address
8185 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8186 VAddrs.push_back(Op.getOperand(I));
8187 }
8188
8189 // If the register allocator cannot place the address registers contiguously
8190 // without introducing moves, then using the non-sequential address encoding
8191 // is always preferable, since it saves VALU instructions and is usually a
8192 // wash in terms of code size or even better.
8193 //
8194 // However, we currently have no way of hinting to the register allocator that
8195 // MIMG addresses should be placed contiguously when it is possible to do so,
8196 // so force non-NSA for the common 2-address case as a heuristic.
8197 //
8198 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8199 // allocation when possible.
8200 //
8201 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8202 // set of the remaining addresses.
8203 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8204 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8205 const bool UseNSA = ST->hasNSAEncoding() &&
8206 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8207 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8208 const bool UsePartialNSA =
8209 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8210
8211 SDValue VAddr;
8212 if (UsePartialNSA) {
8213 VAddr = getBuildDwordsVector(DAG, DL,
8214 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8215 } else if (!UseNSA) {
8216 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8217 }
8218
8219 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8220 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8221 SDValue Unorm;
8222 if (!BaseOpcode->Sampler) {
8223 Unorm = True;
8224 } else {
8225 uint64_t UnormConst =
8226 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8227
8228 Unorm = UnormConst ? True : False;
8229 }
8230
8231 SDValue TFE;
8232 SDValue LWE;
8233 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8234 bool IsTexFail = false;
8235 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8236 return Op;
8237
8238 if (IsTexFail) {
8239 if (!DMaskLanes) {
8240 // Expecting to get an error flag since TFC is on and dmask is 0.
8241 // Force dmask to be at least 1, otherwise the instruction will fail.
8242 DMask = 0x1;
8243 DMaskLanes = 1;
8244 NumVDataDwords = 1;
8245 }
8246 NumVDataDwords += 1;
8247 AdjustRetType = true;
8248 }
8249
8250 // Something earlier tagged that the return type needs adjusting.
8251 // This happens if the instruction is a load or has TexFailCtrl flags set.
8252 if (AdjustRetType) {
8253 // NumVDataDwords reflects the true number of dwords required in the return
8254 // type
8255 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8256 // This is a no-op load. This can be eliminated
8257 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8258 if (isa<MemSDNode>(Op))
8259 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8260 return Undef;
8261 }
8262
8263 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8264 MVT::i32, NumVDataDwords)
8265 : MVT::i32;
8266
8267 ResultTypes[0] = NewVT;
8268 if (ResultTypes.size() == 3) {
8269 // Original result was aggregate type used for TexFailCtrl results
8270 // The actual instruction returns as a vector type which has now been
8271 // created. Remove the aggregate result.
8272 ResultTypes.erase(&ResultTypes[1]);
8273 }
8274 }
8275
8276 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8277 if (BaseOpcode->Atomic)
8278 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8279 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8281 return Op;
8282
8284 if (BaseOpcode->Store || BaseOpcode->Atomic)
8285 Ops.push_back(VData); // vdata
8286 if (UsePartialNSA) {
8287 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8288 Ops.push_back(VAddr);
8289 } else if (UseNSA)
8290 append_range(Ops, VAddrs);
8291 else
8292 Ops.push_back(VAddr);
8293 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8294 EVT RsrcVT = Rsrc.getValueType();
8295 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8296 return Op;
8297 Ops.push_back(Rsrc);
8298 if (BaseOpcode->Sampler) {
8299 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8300 if (Samp.getValueType() != MVT::v4i32)
8301 return Op;
8302 Ops.push_back(Samp);
8303 }
8304 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8305 if (IsGFX10Plus)
8306 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8307 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8308 Ops.push_back(Unorm);
8309 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8310 Ops.push_back(IsA16 && // r128, a16 for gfx9
8311 ST->hasFeature(AMDGPU::FeatureR128A16)
8312 ? True
8313 : False);
8314 if (IsGFX10Plus)
8315 Ops.push_back(IsA16 ? True : False);
8316 if (!Subtarget->hasGFX90AInsts()) {
8317 Ops.push_back(TFE); // tfe
8318 } else if (TFE->getAsZExtVal()) {
8319 report_fatal_error("TFE is not supported on this GPU");
8320 }
8321 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8322 Ops.push_back(LWE); // lwe
8323 if (!IsGFX10Plus)
8324 Ops.push_back(DimInfo->DA ? True : False);
8325 if (BaseOpcode->HasD16)
8326 Ops.push_back(IsD16 ? True : False);
8327 if (isa<MemSDNode>(Op))
8328 Ops.push_back(Op.getOperand(0)); // chain
8329
8330 int NumVAddrDwords =
8331 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8332 int Opcode = -1;
8333
8334 if (IsGFX12Plus) {
8335 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8336 NumVDataDwords, NumVAddrDwords);
8337 } else if (IsGFX11Plus) {
8338 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8339 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8340 : AMDGPU::MIMGEncGfx11Default,
8341 NumVDataDwords, NumVAddrDwords);
8342 } else if (IsGFX10Plus) {
8343 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8344 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8345 : AMDGPU::MIMGEncGfx10Default,
8346 NumVDataDwords, NumVAddrDwords);
8347 } else {
8348 if (Subtarget->hasGFX90AInsts()) {
8349 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8350 NumVDataDwords, NumVAddrDwords);
8351 if (Opcode == -1)
8353 "requested image instruction is not supported on this GPU");
8354 }
8355 if (Opcode == -1 &&
8357 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8358 NumVDataDwords, NumVAddrDwords);
8359 if (Opcode == -1)
8360 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8361 NumVDataDwords, NumVAddrDwords);
8362 }
8363 if (Opcode == -1)
8364 return Op;
8365
8366 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8367 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8368 MachineMemOperand *MemRef = MemOp->getMemOperand();
8369 DAG.setNodeMemRefs(NewNode, {MemRef});
8370 }
8371
8372 if (BaseOpcode->AtomicX2) {
8374 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8375 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8376 }
8377 if (BaseOpcode->NoReturn)
8378 return SDValue(NewNode, 0);
8379 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8380 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8381 NumVDataDwords, IsAtomicPacked16Bit, DL);
8382}
8383
8384SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8385 SDValue Offset, SDValue CachePolicy,
8386 SelectionDAG &DAG) const {
8388
8389 const DataLayout &DataLayout = DAG.getDataLayout();
8390 Align Alignment =
8392
8397 VT.getStoreSize(), Alignment);
8398
8399 if (!Offset->isDivergent()) {
8400 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8401
8402 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8403 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8404 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8405 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8406 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8407 SDValue BufferLoad =
8409 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8410 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8411 }
8412
8413 // Widen vec3 load to vec4.
8414 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8415 !Subtarget->hasScalarDwordx3Loads()) {
8416 EVT WidenedVT =
8418 auto WidenedOp = DAG.getMemIntrinsicNode(
8419 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8420 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8421 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8422 DAG.getVectorIdxConstant(0, DL));
8423 return Subvector;
8424 }
8425
8427 DAG.getVTList(VT), Ops, VT, MMO);
8428 }
8429
8430 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8431 // assume that the buffer is unswizzled.
8432 SDValue Ops[] = {
8433 DAG.getEntryNode(), // Chain
8434 Rsrc, // rsrc
8435 DAG.getConstant(0, DL, MVT::i32), // vindex
8436 {}, // voffset
8437 {}, // soffset
8438 {}, // offset
8439 CachePolicy, // cachepolicy
8440 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8441 };
8442 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8443 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8444 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8445 }
8446
8448 unsigned NumLoads = 1;
8449 MVT LoadVT = VT.getSimpleVT();
8450 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8451 assert((LoadVT.getScalarType() == MVT::i32 ||
8452 LoadVT.getScalarType() == MVT::f32));
8453
8454 if (NumElts == 8 || NumElts == 16) {
8455 NumLoads = NumElts / 4;
8456 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8457 }
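  // Illustrative example: with a divergent offset, a v8f32 s.buffer.load is
  // emitted below as two 4-dword buffer loads at InstOffset and
  // InstOffset + 16, and the pieces are concatenated back together.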
8458
8459 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8460
8461 // Use the alignment to ensure that the required offsets will fit into the
8462 // instructions' immediate offset fields.
8463 setBufferOffsets(Offset, DAG, &Ops[3],
8464 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8465
8466 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8467 for (unsigned i = 0; i < NumLoads; ++i) {
8468 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8469 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8470 LoadVT, MMO, DAG));
8471 }
8472
8473 if (NumElts == 8 || NumElts == 16)
8474 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8475
8476 return Loads[0];
8477}
8478
8479SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8480 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8481 if (!Subtarget->hasArchitectedSGPRs())
8482 return {};
8483 SDLoc SL(Op);
8484 MVT VT = MVT::i32;
8485 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8486 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8487 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8488}
8489
8490SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8491 unsigned Dim,
8492 const ArgDescriptor &Arg) const {
8493 SDLoc SL(Op);
8495 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8496 if (MaxID == 0)
8497 return DAG.getConstant(0, SL, MVT::i32);
8498
8499 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8500 SDLoc(DAG.getEntryNode()), Arg);
8501
8502 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8503 // masking operations anyway.
8504 //
8505 // TODO: We could assert the top bit is 0 for the source copy.
8506 if (Arg.isMasked())
8507 return Val;
8508
8509 // Preserve the known bits after expansion to a copy.
8511 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8512 DAG.getValueType(SmallVT));
8513}
8514
8515SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8516 SelectionDAG &DAG) const {
8518 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8519
8520 EVT VT = Op.getValueType();
8521 SDLoc DL(Op);
8522 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8523
8524 // TODO: Should this propagate fast-math-flags?
8525
8526 switch (IntrinsicID) {
8527 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8528 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8529 return emitNonHSAIntrinsicError(DAG, DL, VT);
8530 return getPreloadedValue(DAG, *MFI, VT,
8532 }
8533 case Intrinsic::amdgcn_dispatch_ptr:
8534 case Intrinsic::amdgcn_queue_ptr: {
8535 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8536 DiagnosticInfoUnsupported BadIntrin(
8537 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8538 DL.getDebugLoc());
8539 DAG.getContext()->diagnose(BadIntrin);
8540 return DAG.getUNDEF(VT);
8541 }
8542
8543 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8546 return getPreloadedValue(DAG, *MFI, VT, RegID);
8547 }
8548 case Intrinsic::amdgcn_implicitarg_ptr: {
8549 if (MFI->isEntryFunction())
8550 return getImplicitArgPtr(DAG, DL);
8551 return getPreloadedValue(DAG, *MFI, VT,
8553 }
8554 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8556 // This only makes sense to call in a kernel, so just lower to null.
8557 return DAG.getConstant(0, DL, VT);
8558 }
8559
8560 return getPreloadedValue(DAG, *MFI, VT,
8562 }
8563 case Intrinsic::amdgcn_dispatch_id: {
8564 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8565 }
8566 case Intrinsic::amdgcn_rcp:
8567 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8568 case Intrinsic::amdgcn_rsq:
8569 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8570 case Intrinsic::amdgcn_rsq_legacy:
8572 return emitRemovedIntrinsicError(DAG, DL, VT);
8573 return SDValue();
8574 case Intrinsic::amdgcn_rcp_legacy:
8576 return emitRemovedIntrinsicError(DAG, DL, VT);
8577 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8578 case Intrinsic::amdgcn_rsq_clamp: {
8580 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8581
8582 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8585
8586 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8587 SDValue Tmp =
8588 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8589 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8590 DAG.getConstantFP(Min, DL, VT));
8591 }
8592 case Intrinsic::r600_read_ngroups_x:
8593 if (Subtarget->isAmdHsaOS())
8594 return emitNonHSAIntrinsicError(DAG, DL, VT);
8595
8596 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8597 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8598 false);
8599 case Intrinsic::r600_read_ngroups_y:
8600 if (Subtarget->isAmdHsaOS())
8601 return emitNonHSAIntrinsicError(DAG, DL, VT);
8602
8603 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8604 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8605 false);
8606 case Intrinsic::r600_read_ngroups_z:
8607 if (Subtarget->isAmdHsaOS())
8608 return emitNonHSAIntrinsicError(DAG, DL, VT);
8609
8610 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8611 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8612 false);
8613 case Intrinsic::r600_read_global_size_x:
8614 if (Subtarget->isAmdHsaOS())
8615 return emitNonHSAIntrinsicError(DAG, DL, VT);
8616
8617 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8618 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8619 Align(4), false);
8620 case Intrinsic::r600_read_global_size_y:
8621 if (Subtarget->isAmdHsaOS())
8622 return emitNonHSAIntrinsicError(DAG, DL, VT);
8623
8624 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8625 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8626 Align(4), false);
8627 case Intrinsic::r600_read_global_size_z:
8628 if (Subtarget->isAmdHsaOS())
8629 return emitNonHSAIntrinsicError(DAG, DL, VT);
8630
8631 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8632 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8633 Align(4), false);
8634 case Intrinsic::r600_read_local_size_x:
8635 if (Subtarget->isAmdHsaOS())
8636 return emitNonHSAIntrinsicError(DAG, DL, VT);
8637
8638 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8639 SI::KernelInputOffsets::LOCAL_SIZE_X);
8640 case Intrinsic::r600_read_local_size_y:
8641 if (Subtarget->isAmdHsaOS())
8642 return emitNonHSAIntrinsicError(DAG, DL, VT);
8643
8644 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8645 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8646 case Intrinsic::r600_read_local_size_z:
8647 if (Subtarget->isAmdHsaOS())
8648 return emitNonHSAIntrinsicError(DAG, DL, VT);
8649
8650 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8651 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8652 case Intrinsic::amdgcn_workgroup_id_x:
8653 return getPreloadedValue(DAG, *MFI, VT,
8654 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8655 case Intrinsic::amdgcn_workgroup_id_y:
8656 return getPreloadedValue(DAG, *MFI, VT,
8657 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8658 case Intrinsic::amdgcn_workgroup_id_z:
8659 return getPreloadedValue(DAG, *MFI, VT,
8660 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8661 case Intrinsic::amdgcn_wave_id:
8662 return lowerWaveID(DAG, Op);
8663 case Intrinsic::amdgcn_lds_kernel_id: {
8664 if (MFI->isEntryFunction())
8665 return getLDSKernelId(DAG, DL);
8666 return getPreloadedValue(DAG, *MFI, VT,
8667 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8668 }
8669 case Intrinsic::amdgcn_workitem_id_x:
8670 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8671 case Intrinsic::amdgcn_workitem_id_y:
8672 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8673 case Intrinsic::amdgcn_workitem_id_z:
8674 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8675 case Intrinsic::amdgcn_wavefrontsize:
8676 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8677 SDLoc(Op), MVT::i32);
8678 case Intrinsic::amdgcn_s_buffer_load: {
8679 unsigned CPol = Op.getConstantOperandVal(3);
8680 // s_buffer_load, because of how it's optimized, can't be volatile
8681 // so reject ones with the volatile bit set.
8682 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8683 ? AMDGPU::CPol::ALL
8684 : AMDGPU::CPol::ALL_pregfx12))
8685 return Op;
8686 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8687 Op.getOperand(3), DAG);
8688 }
8689 case Intrinsic::amdgcn_fdiv_fast:
8690 return lowerFDIV_FAST(Op, DAG);
8691 case Intrinsic::amdgcn_sin:
8692 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8693
8694 case Intrinsic::amdgcn_cos:
8695 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8696
8697 case Intrinsic::amdgcn_mul_u24:
8698 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8699 Op.getOperand(2));
8700 case Intrinsic::amdgcn_mul_i24:
8701 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8702 Op.getOperand(2));
8703
8704 case Intrinsic::amdgcn_log_clamp: {
8705 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8706 return SDValue();
8707
8708 return emitRemovedIntrinsicError(DAG, DL, VT);
8709 }
8710 case Intrinsic::amdgcn_fract:
8711 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8712
8713 case Intrinsic::amdgcn_class:
8714 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8715 Op.getOperand(2));
8716 case Intrinsic::amdgcn_div_fmas:
8717 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8718 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8719
8720 case Intrinsic::amdgcn_div_fixup:
8721 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8722 Op.getOperand(2), Op.getOperand(3));
8723
8724 case Intrinsic::amdgcn_div_scale: {
8725 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8726
8727 // Translate to the operands expected by the machine instruction. The
8728 // first parameter must be the same as the first instruction.
8729 SDValue Numerator = Op.getOperand(1);
8730 SDValue Denominator = Op.getOperand(2);
8731
8732 // Note this order is opposite of the machine instruction's operations,
8733 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8734 // intrinsic has the numerator as the first operand to match a normal
8735 // division operation.
8736
8737 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
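// Illustrative: when the i1 constant operand is true (all ones), the
// numerator is used as src0; when it is false, the denominator is.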
8738
8739 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8740 Denominator, Numerator);
8741 }
8742 case Intrinsic::amdgcn_icmp: {
8743 // There is a Pat that handles this variant, so return it as-is.
8744 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8745 Op.getConstantOperandVal(2) == 0 &&
8746 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8747 return Op;
8748 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8749 }
8750 case Intrinsic::amdgcn_fcmp: {
8751 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8752 }
8753 case Intrinsic::amdgcn_ballot:
8754 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8755 case Intrinsic::amdgcn_fmed3:
8756 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8757 Op.getOperand(2), Op.getOperand(3));
8758 case Intrinsic::amdgcn_fdot2:
8759 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8760 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8761 case Intrinsic::amdgcn_fmul_legacy:
8762 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8763 Op.getOperand(2));
8764 case Intrinsic::amdgcn_sffbh:
8765 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8766 case Intrinsic::amdgcn_sbfe:
8767 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8768 Op.getOperand(2), Op.getOperand(3));
8769 case Intrinsic::amdgcn_ubfe:
8770 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8771 Op.getOperand(2), Op.getOperand(3));
8772 case Intrinsic::amdgcn_cvt_pkrtz:
8773 case Intrinsic::amdgcn_cvt_pknorm_i16:
8774 case Intrinsic::amdgcn_cvt_pknorm_u16:
8775 case Intrinsic::amdgcn_cvt_pk_i16:
8776 case Intrinsic::amdgcn_cvt_pk_u16: {
8777 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8778 EVT VT = Op.getValueType();
8779 unsigned Opcode;
8780
8781 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8782 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8783 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8784 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8785 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8786 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8787 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8788 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8789 else
8790 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8791
8792 if (isTypeLegal(VT))
8793 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8794
8795 SDValue Node =
8796 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8797 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8798 }
8799 case Intrinsic::amdgcn_fmad_ftz:
8800 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8801 Op.getOperand(2), Op.getOperand(3));
8802
8803 case Intrinsic::amdgcn_if_break:
8804 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8805 Op->getOperand(1), Op->getOperand(2)),
8806 0);
8807
8808 case Intrinsic::amdgcn_groupstaticsize: {
8809 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8810 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8811 return Op;
8812
8813 const Module *M = MF.getFunction().getParent();
8814 const GlobalValue *GV =
8815 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8816 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8817 SIInstrInfo::MO_ABS32_LO);
8818 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8819 }
8820 case Intrinsic::amdgcn_is_shared:
8821 case Intrinsic::amdgcn_is_private: {
8822 SDLoc SL(Op);
8823 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8824 ? AMDGPUAS::LOCAL_ADDRESS
8825 : AMDGPUAS::PRIVATE_ADDRESS;
8826 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8827 SDValue SrcVec =
8828 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8829
8830 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8831 DAG.getConstant(1, SL, MVT::i32));
8832 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8833 }
8834 case Intrinsic::amdgcn_perm:
8835 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8836 Op.getOperand(2), Op.getOperand(3));
8837 case Intrinsic::amdgcn_reloc_constant: {
8838 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8839 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8840 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8841 auto *RelocSymbol = cast<GlobalVariable>(
8842 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8843 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8844 SIInstrInfo::MO_ABS32_LO);
8845 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8846 }
8847 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8848 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8849 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8850 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8851 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8852 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8853 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8854 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8855 if (Op.getOperand(4).getValueType() == MVT::i32)
8856 return SDValue();
8857
8858 SDLoc SL(Op);
8859 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8860 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8861 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8862 Op.getOperand(3), IndexKeyi32);
8863 }
8864 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8865 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8866 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8867 if (Op.getOperand(6).getValueType() == MVT::i32)
8868 return SDValue();
8869
8870 SDLoc SL(Op);
8871 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8872 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8873 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8874 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8875 IndexKeyi32, Op.getOperand(7)});
8876 }
8877 case Intrinsic::amdgcn_addrspacecast_nonnull:
8878 return lowerADDRSPACECAST(Op, DAG);
8879 case Intrinsic::amdgcn_readlane:
8880 case Intrinsic::amdgcn_readfirstlane:
8881 case Intrinsic::amdgcn_writelane:
8882 case Intrinsic::amdgcn_permlane16:
8883 case Intrinsic::amdgcn_permlanex16:
8884 case Intrinsic::amdgcn_permlane64:
8885 case Intrinsic::amdgcn_set_inactive:
8886 case Intrinsic::amdgcn_set_inactive_chain_arg:
8887 case Intrinsic::amdgcn_mov_dpp8:
8888 case Intrinsic::amdgcn_update_dpp:
8889 return lowerLaneOp(*this, Op.getNode(), DAG);
8890 default:
8891 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8892 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8893 return lowerImage(Op, ImageDimIntr, DAG, false);
8894
8895 return Op;
8896 }
8897}
8898
8899// On targets not supporting constant in soffset field, turn zero to
8900// SGPR_NULL to avoid generating an extra s_mov with zero.
8901 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8902 const GCNSubtarget *Subtarget) {
8903 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8904 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8905 return SOffset;
8906}
8907
8908SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8909 SelectionDAG &DAG,
8910 unsigned NewOpcode) const {
8911 SDLoc DL(Op);
8912
8913 SDValue VData = Op.getOperand(2);
8914 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8915 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8916 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8917 SDValue Ops[] = {
8918 Op.getOperand(0), // Chain
8919 VData, // vdata
8920 Rsrc, // rsrc
8921 DAG.getConstant(0, DL, MVT::i32), // vindex
8922 VOffset, // voffset
8923 SOffset, // soffset
8924 Offset, // offset
8925 Op.getOperand(6), // cachepolicy
8926 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8927 };
8928
8929 auto *M = cast<MemSDNode>(Op);
8930
8931 EVT MemVT = VData.getValueType();
8932 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8933 M->getMemOperand());
8934}
8935
8936SDValue
8937SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8938 unsigned NewOpcode) const {
8939 SDLoc DL(Op);
8940
8941 SDValue VData = Op.getOperand(2);
8942 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8943 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8944 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8945 SDValue Ops[] = {
8946 Op.getOperand(0), // Chain
8947 VData, // vdata
8948 Rsrc, // rsrc
8949 Op.getOperand(4), // vindex
8950 VOffset, // voffset
8951 SOffset, // soffset
8952 Offset, // offset
8953 Op.getOperand(7), // cachepolicy
8954 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8955 };
8956
8957 auto *M = cast<MemSDNode>(Op);
8958
8959 EVT MemVT = VData.getValueType();
8960 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8961 M->getMemOperand());
8962}
8963
8964SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8965 SelectionDAG &DAG) const {
8966 unsigned IntrID = Op.getConstantOperandVal(1);
8967 SDLoc DL(Op);
8968
8969 switch (IntrID) {
8970 case Intrinsic::amdgcn_ds_ordered_add:
8971 case Intrinsic::amdgcn_ds_ordered_swap: {
8972 MemSDNode *M = cast<MemSDNode>(Op);
8973 SDValue Chain = M->getOperand(0);
8974 SDValue M0 = M->getOperand(2);
8975 SDValue Value = M->getOperand(3);
8976 unsigned IndexOperand = M->getConstantOperandVal(7);
8977 unsigned WaveRelease = M->getConstantOperandVal(8);
8978 unsigned WaveDone = M->getConstantOperandVal(9);
8979
8980 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8981 IndexOperand &= ~0x3f;
8982 unsigned CountDw = 0;
8983
8984 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8985 CountDw = (IndexOperand >> 24) & 0xf;
8986 IndexOperand &= ~(0xf << 24);
8987
8988 if (CountDw < 1 || CountDw > 4) {
8989 report_fatal_error(
8990 "ds_ordered_count: dword count must be between 1 and 4");
8991 }
8992 }
8993
8994 if (IndexOperand)
8995 report_fatal_error("ds_ordered_count: bad index operand");
8996
8997 if (WaveDone && !WaveRelease)
8998 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8999
9000 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9001 unsigned ShaderType =
9002 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9003 unsigned Offset0 = OrderedCountIndex << 2;
9004 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9005
9006 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9007 Offset1 |= (CountDw - 1) << 6;
9008
9009 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9010 Offset1 |= ShaderType << 2;
9011
9012 unsigned Offset = Offset0 | (Offset1 << 8);
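// Illustrative layout of the packed offset built above: Offset0 is the
// dword-aligned ordered-count index; Offset1 carries wave_release in bit 0,
// wave_done in bit 1, the shader type in bits 3:2 (pre-GFX11), the add/swap
// selector in bit 4, and the dword count minus one in bits 7:6 (GFX10+).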
9013
9014 SDValue Ops[] = {
9015 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9016 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9017 };
9018 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9019 M->getVTList(), Ops, M->getMemoryVT(),
9020 M->getMemOperand());
9021 }
9022 case Intrinsic::amdgcn_raw_buffer_load:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9024 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9025 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9026 case Intrinsic::amdgcn_raw_buffer_load_format:
9027 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9028 const bool IsFormat =
9029 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9030 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9031
9032 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9033 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9034 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9035 SDValue Ops[] = {
9036 Op.getOperand(0), // Chain
9037 Rsrc, // rsrc
9038 DAG.getConstant(0, DL, MVT::i32), // vindex
9039 VOffset, // voffset
9040 SOffset, // soffset
9041 Offset, // offset
9042 Op.getOperand(5), // cachepolicy, swizzled buffer
9043 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9044 };
9045
9046 auto *M = cast<MemSDNode>(Op);
9047 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9048 }
9049 case Intrinsic::amdgcn_struct_buffer_load:
9050 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9051 case Intrinsic::amdgcn_struct_buffer_load_format:
9052 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9053 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9054 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9055 const bool IsFormat =
9056 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9057 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9058
9059 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9060 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9061 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9062 SDValue Ops[] = {
9063 Op.getOperand(0), // Chain
9064 Rsrc, // rsrc
9065 Op.getOperand(3), // vindex
9066 VOffset, // voffset
9067 SOffset, // soffset
9068 Offset, // offset
9069 Op.getOperand(6), // cachepolicy, swizzled buffer
9070 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9071 };
9072
9073 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9074 }
9075 case Intrinsic::amdgcn_raw_tbuffer_load:
9076 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9077 MemSDNode *M = cast<MemSDNode>(Op);
9078 EVT LoadVT = Op.getValueType();
9079 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9080 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9081 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9082
9083 SDValue Ops[] = {
9084 Op.getOperand(0), // Chain
9085 Rsrc, // rsrc
9086 DAG.getConstant(0, DL, MVT::i32), // vindex
9087 VOffset, // voffset
9088 SOffset, // soffset
9089 Offset, // offset
9090 Op.getOperand(5), // format
9091 Op.getOperand(6), // cachepolicy, swizzled buffer
9092 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9093 };
9094
9095 if (LoadVT.getScalarType() == MVT::f16)
9096 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9097 Ops);
9098 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9099 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9100 DAG);
9101 }
9102 case Intrinsic::amdgcn_struct_tbuffer_load:
9103 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9104 MemSDNode *M = cast<MemSDNode>(Op);
9105 EVT LoadVT = Op.getValueType();
9106 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9107 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9108 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9109
9110 SDValue Ops[] = {
9111 Op.getOperand(0), // Chain
9112 Rsrc, // rsrc
9113 Op.getOperand(3), // vindex
9114 VOffset, // voffset
9115 SOffset, // soffset
9116 Offset, // offset
9117 Op.getOperand(6), // format
9118 Op.getOperand(7), // cachepolicy, swizzled buffer
9119 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9120 };
9121
9122 if (LoadVT.getScalarType() == MVT::f16)
9123 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9124 Ops);
9125 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9126 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9127 DAG);
9128 }
9129 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9130 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9131 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9132 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9134 return lowerStructBufferAtomicIntrin(Op, DAG,
9135 AMDGPUISD::BUFFER_ATOMIC_FADD);
9136 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9137 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9138 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9139 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9140 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9141 return lowerStructBufferAtomicIntrin(Op, DAG,
9142 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9143 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9144 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9145 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9146 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9147 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9148 return lowerStructBufferAtomicIntrin(Op, DAG,
9149 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9150 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9151 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9152 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9153 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9154 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9155 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9156 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9157 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9158 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9159 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9160 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9161 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9162 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9163 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9164 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9165 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9166 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9167 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9168 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9170 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9171 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9172 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9173 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9174 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9175 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9176 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9177 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9179 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9180 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9181 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9182 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9183 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9184 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9185 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9186 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9187 return lowerRawBufferAtomicIntrin(Op, DAG,
9188 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9189 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9190 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9191 return lowerStructBufferAtomicIntrin(Op, DAG,
9192 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9193 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9194 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9195 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9196 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9197 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9198 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9199 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9200 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9201 return lowerStructBufferAtomicIntrin(Op, DAG,
9202 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9203 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9204 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9205 return lowerStructBufferAtomicIntrin(Op, DAG,
9206 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9207 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9209 return lowerStructBufferAtomicIntrin(Op, DAG,
9210 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9211 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9212 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9213 return lowerStructBufferAtomicIntrin(Op, DAG,
9214 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9215 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9217 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9218 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9219 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9220 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9221 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9222 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9223 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9224 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9225 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9226 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9227 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9229 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9230 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9231 return lowerStructBufferAtomicIntrin(Op, DAG,
9232 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9233
9234 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9235 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9236 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9237 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9238 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9239 SDValue Ops[] = {
9240 Op.getOperand(0), // Chain
9241 Op.getOperand(2), // src
9242 Op.getOperand(3), // cmp
9243 Rsrc, // rsrc
9244 DAG.getConstant(0, DL, MVT::i32), // vindex
9245 VOffset, // voffset
9246 SOffset, // soffset
9247 Offset, // offset
9248 Op.getOperand(7), // cachepolicy
9249 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9250 };
9251 EVT VT = Op.getValueType();
9252 auto *M = cast<MemSDNode>(Op);
9253
9254 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9255 Op->getVTList(), Ops, VT,
9256 M->getMemOperand());
9257 }
9258 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9260 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9261 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9262 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9263 SDValue Ops[] = {
9264 Op.getOperand(0), // Chain
9265 Op.getOperand(2), // src
9266 Op.getOperand(3), // cmp
9267 Rsrc, // rsrc
9268 Op.getOperand(5), // vindex
9269 VOffset, // voffset
9270 SOffset, // soffset
9271 Offset, // offset
9272 Op.getOperand(8), // cachepolicy
9273 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9274 };
9275 EVT VT = Op.getValueType();
9276 auto *M = cast<MemSDNode>(Op);
9277
9278 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9279 Op->getVTList(), Ops, VT,
9280 M->getMemOperand());
9281 }
9282 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9283 MemSDNode *M = cast<MemSDNode>(Op);
9284 SDValue NodePtr = M->getOperand(2);
9285 SDValue RayExtent = M->getOperand(3);
9286 SDValue RayOrigin = M->getOperand(4);
9287 SDValue RayDir = M->getOperand(5);
9288 SDValue RayInvDir = M->getOperand(6);
9289 SDValue TDescr = M->getOperand(7);
9290
9291 assert(NodePtr.getValueType() == MVT::i32 ||
9292 NodePtr.getValueType() == MVT::i64);
9293 assert(RayDir.getValueType() == MVT::v3f16 ||
9294 RayDir.getValueType() == MVT::v3f32);
9295
9296 if (!Subtarget->hasGFX10_AEncoding()) {
9297 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9298 return SDValue();
9299 }
9300
9301 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9302 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9303 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9304 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9305 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9306 const unsigned NumVDataDwords = 4;
9307 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9308 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9309 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9310 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9311 IsGFX12Plus;
9312 const unsigned BaseOpcodes[2][2] = {
9313 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9314 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9315 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9316 int Opcode;
9317 if (UseNSA) {
9318 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9319 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9320 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9321 : AMDGPU::MIMGEncGfx10NSA,
9322 NumVDataDwords, NumVAddrDwords);
9323 } else {
9324 assert(!IsGFX12Plus);
9325 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9326 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9327 : AMDGPU::MIMGEncGfx10Default,
9328 NumVDataDwords, NumVAddrDwords);
9329 }
9330 assert(Opcode != -1);
9331
9332 SmallVector<SDValue, 16> Ops;
9333
9334 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9335 SmallVector<SDValue, 3> Lanes;
9336 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9337 if (Lanes[0].getValueSizeInBits() == 32) {
9338 for (unsigned I = 0; I < 3; ++I)
9339 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9340 } else {
9341 if (IsAligned) {
9342 Ops.push_back(DAG.getBitcast(
9343 MVT::i32,
9344 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9345 Ops.push_back(Lanes[2]);
9346 } else {
9347 SDValue Elt0 = Ops.pop_back_val();
9348 Ops.push_back(DAG.getBitcast(
9349 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9350 Ops.push_back(DAG.getBitcast(
9351 MVT::i32,
9352 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9353 }
9354 }
9355 };
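// The packLanes helper above pushes a three-component vector either as three
// dword lanes (32-bit sources) or as packed f16 pairs; when a previous call
// left an unpaired half (IsAligned == false), that element is popped and
// packed with the first lane here, and the remaining two lanes form the next
// packed dword.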
9356
9357 if (UseNSA && IsGFX11Plus) {
9358 Ops.push_back(NodePtr);
9359 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9360 Ops.push_back(RayOrigin);
9361 if (IsA16) {
9362 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9363 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9364 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9365 for (unsigned I = 0; I < 3; ++I) {
9366 MergedLanes.push_back(DAG.getBitcast(
9367 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9368 {DirLanes[I], InvDirLanes[I]})));
9369 }
9370 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9371 } else {
9372 Ops.push_back(RayDir);
9373 Ops.push_back(RayInvDir);
9374 }
9375 } else {
9376 if (Is64)
9377 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9378 2);
9379 else
9380 Ops.push_back(NodePtr);
9381
9382 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9383 packLanes(RayOrigin, true);
9384 packLanes(RayDir, true);
9385 packLanes(RayInvDir, false);
9386 }
9387
9388 if (!UseNSA) {
9389 // Build a single vector containing all the operands prepared so far.
9390 if (NumVAddrDwords > 12) {
9391 SDValue Undef = DAG.getUNDEF(MVT::i32);
9392 Ops.append(16 - Ops.size(), Undef);
9393 }
9394 assert(Ops.size() >= 8 && Ops.size() <= 12);
9395 SDValue MergedOps =
9396 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9397 Ops.clear();
9398 Ops.push_back(MergedOps);
9399 }
9400
9401 Ops.push_back(TDescr);
9402 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9403 Ops.push_back(M->getChain());
9404
9405 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9406 MachineMemOperand *MemRef = M->getMemOperand();
9407 DAG.setNodeMemRefs(NewNode, {MemRef});
9408 return SDValue(NewNode, 0);
9409 }
9410 case Intrinsic::amdgcn_global_atomic_fmin_num:
9411 case Intrinsic::amdgcn_global_atomic_fmax_num:
9412 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9413 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9414 MemSDNode *M = cast<MemSDNode>(Op);
9415 SDValue Ops[] = {
9416 M->getOperand(0), // Chain
9417 M->getOperand(2), // Ptr
9418 M->getOperand(3) // Value
9419 };
9420 unsigned Opcode = 0;
9421 switch (IntrID) {
9422 case Intrinsic::amdgcn_global_atomic_fmin_num:
9423 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9424 Opcode = ISD::ATOMIC_LOAD_FMIN;
9425 break;
9426 }
9427 case Intrinsic::amdgcn_global_atomic_fmax_num:
9428 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9429 Opcode = ISD::ATOMIC_LOAD_FMAX;
9430 break;
9431 }
9432 default:
9433 llvm_unreachable("unhandled atomic opcode");
9434 }
9435 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9436 Ops, M->getMemOperand());
9437 }
9438 case Intrinsic::amdgcn_s_get_barrier_state:
9439 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9440 SDValue Chain = Op->getOperand(0);
9441 SmallVector<SDValue, 2> Ops;
9442 unsigned Opc;
9443
9444 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9445 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9446 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9447 BarID = (BarID >> 4) & 0x3F;
9448 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9449 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9450 Ops.push_back(K);
9451 Ops.push_back(Chain);
9452 } else {
9453 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9454 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9455 SDValue M0Val;
9456 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9457 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9458 M0Val = SDValue(
9459 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9460 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9461 0);
9462 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9463 } else
9464 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9465 }
9466
9467 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9468 return SDValue(NewMI, 0);
9469 }
9470 default:
9471
9472 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9473 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9474 return lowerImage(Op, ImageDimIntr, DAG, true);
9475
9476 return SDValue();
9477 }
9478}
9479
9480// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9481// dwordx4 if on SI and handle TFE loads.
9482SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9483 SDVTList VTList,
9484 ArrayRef<SDValue> Ops, EVT MemVT,
9485 MachineMemOperand *MMO,
9486 SelectionDAG &DAG) const {
9487 LLVMContext &C = *DAG.getContext();
9488 MachineFunction &MF = DAG.getMachineFunction();
9489 EVT VT = VTList.VTs[0];
9490
9491 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9492 bool IsTFE = VTList.NumVTs == 3;
9493 if (IsTFE) {
9494 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9495 unsigned NumOpDWords = NumValueDWords + 1;
9496 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9497 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9498 MachineMemOperand *OpDWordsMMO =
9499 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9500 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9501 OpDWordsVT, OpDWordsMMO, DAG);
9502 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9503 DAG.getVectorIdxConstant(NumValueDWords, DL));
9504 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9505 SDValue ValueDWords =
9506 NumValueDWords == 1
9507 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9508 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9509 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9510 ZeroIdx);
9511 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9512 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9513 }
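// Illustrative: for a TFE dwordx3 load, NumValueDWords is 3, so the widened
// operation returns four dwords (data plus the status word); the code above
// then re-splits the result into the original value type, the i32 status,
// and the chain.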
9514
9515 if (!Subtarget->hasDwordx3LoadStores() &&
9516 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9517 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9518 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9519 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9520 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9521 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9522 WidenedMemVT, WidenedMMO);
9523 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9524 DAG.getVectorIdxConstant(0, DL));
9525 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9526 }
9527
9528 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9529}
9530
9531SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9532 bool ImageStore) const {
9533 EVT StoreVT = VData.getValueType();
9534
9535 // No change for f16 and legal vector D16 types.
9536 if (!StoreVT.isVector())
9537 return VData;
9538
9539 SDLoc DL(VData);
9540 unsigned NumElements = StoreVT.getVectorNumElements();
9541
9542 if (Subtarget->hasUnpackedD16VMem()) {
9543 // We need to unpack the packed data to store.
9544 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9545 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9546
9547 EVT EquivStoreVT =
9548 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9549 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9550 return DAG.UnrollVectorOp(ZExt.getNode());
9551 }
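// Illustrative: on unpacked-D16 targets a v4f16 payload is therefore emitted
// as four zero-extended i32 values, one component per dword, rather than two
// packed dwords.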
9552
9553 // The sq block of gfx8.1 does not estimate register use correctly for d16
9554 // image store instructions. The data operand is computed as if it were not a
9555 // d16 image instruction.
9556 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9557 // Bitcast to i16
9558 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9559 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9560
9561 // Decompose into scalars
9562 SmallVector<SDValue, 4> Elts;
9563 DAG.ExtractVectorElements(IntVData, Elts);
9564
9565 // Group pairs of i16 into v2i16 and bitcast to i32
9566 SmallVector<SDValue, 4> PackedElts;
9567 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9568 SDValue Pair =
9569 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9570 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9571 PackedElts.push_back(IntPair);
9572 }
9573 if ((NumElements % 2) == 1) {
9574 // Handle v3i16
9575 unsigned I = Elts.size() / 2;
9576 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9577 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9578 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9579 PackedElts.push_back(IntPair);
9580 }
9581
9582 // Pad using UNDEF
9583 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9584
9585 // Build final vector
9586 EVT VecVT =
9587 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9588 return DAG.getBuildVector(VecVT, DL, PackedElts);
9589 }
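// Illustrative: on these parts a v3f16 image store becomes a v3i32 payload,
// two dwords carrying the packed halves (the odd half padded with undef) plus
// an undef pad dword, so the register count matches the non-d16 encoding.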
9590
9591 if (NumElements == 3) {
9592 EVT IntStoreVT =
9593 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9594 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9595
9596 EVT WidenedStoreVT = EVT::getVectorVT(
9597 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9598 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9599 WidenedStoreVT.getStoreSizeInBits());
9600 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9601 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9602 }
9603
9604 assert(isTypeLegal(StoreVT));
9605 return VData;
9606}
9607
9608SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9609 SelectionDAG &DAG) const {
9610 SDLoc DL(Op);
9611 SDValue Chain = Op.getOperand(0);
9612 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9613 MachineFunction &MF = DAG.getMachineFunction();
9614
9615 switch (IntrinsicID) {
9616 case Intrinsic::amdgcn_exp_compr: {
9617 if (!Subtarget->hasCompressedExport()) {
9618 DiagnosticInfoUnsupported BadIntrin(
9619 DAG.getMachineFunction().getFunction(),
9620 "intrinsic not supported on subtarget", DL.getDebugLoc());
9621 DAG.getContext()->diagnose(BadIntrin);
9622 }
9623 SDValue Src0 = Op.getOperand(4);
9624 SDValue Src1 = Op.getOperand(5);
9625 // Hack around illegal type on SI by directly selecting it.
9626 if (isTypeLegal(Src0.getValueType()))
9627 return SDValue();
9628
9629 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9630 SDValue Undef = DAG.getUNDEF(MVT::f32);
9631 const SDValue Ops[] = {
9632 Op.getOperand(2), // tgt
9633 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9634 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9635 Undef, // src2
9636 Undef, // src3
9637 Op.getOperand(7), // vm
9638 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9639 Op.getOperand(3), // en
9640 Op.getOperand(0) // Chain
9641 };
9642
9643 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9644 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9645 }
9646 case Intrinsic::amdgcn_s_barrier:
9647 case Intrinsic::amdgcn_s_barrier_signal:
9648 case Intrinsic::amdgcn_s_barrier_wait: {
9649 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9650 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9651 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9652 if (WGSize <= ST.getWavefrontSize()) {
9653 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9654 // s_barrier/s_barrier_wait to wave_barrier.
9655 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9656 return Op.getOperand(0);
9657 else
9658 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9659 MVT::Other, Op.getOperand(0)),
9660 0);
9661 }
9662 }
9663
9664 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9665 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9666 SDValue K =
9667 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9668 SDValue BarSignal =
9669 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9670 MVT::Other, K, Op.getOperand(0)),
9671 0);
9672 SDValue BarWait =
9673 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9674 BarSignal.getValue(0)),
9675 0);
9676 return BarWait;
9677 }
9678
9679 return SDValue();
9680 };
9681
9682 case Intrinsic::amdgcn_struct_tbuffer_store:
9683 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9684 SDValue VData = Op.getOperand(2);
9685 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9686 if (IsD16)
9687 VData = handleD16VData(VData, DAG);
9688 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9689 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9690 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9691 SDValue Ops[] = {
9692 Chain,
9693 VData, // vdata
9694 Rsrc, // rsrc
9695 Op.getOperand(4), // vindex
9696 VOffset, // voffset
9697 SOffset, // soffset
9698 Offset, // offset
9699 Op.getOperand(7), // format
9700 Op.getOperand(8), // cachepolicy, swizzled buffer
9701 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9702 };
9703 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9704 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9705 MemSDNode *M = cast<MemSDNode>(Op);
9706 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9707 M->getMemoryVT(), M->getMemOperand());
9708 }
9709
9710 case Intrinsic::amdgcn_raw_tbuffer_store:
9711 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9712 SDValue VData = Op.getOperand(2);
9713 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9714 if (IsD16)
9715 VData = handleD16VData(VData, DAG);
9716 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9717 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9718 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9719 SDValue Ops[] = {
9720 Chain,
9721 VData, // vdata
9722 Rsrc, // rsrc
9723 DAG.getConstant(0, DL, MVT::i32), // vindex
9724 VOffset, // voffset
9725 SOffset, // soffset
9726 Offset, // offset
9727 Op.getOperand(6), // format
9728 Op.getOperand(7), // cachepolicy, swizzled buffer
9729 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9730 };
9731 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9732 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9733 MemSDNode *M = cast<MemSDNode>(Op);
9734 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9735 M->getMemoryVT(), M->getMemOperand());
9736 }
9737
9738 case Intrinsic::amdgcn_raw_buffer_store:
9739 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9740 case Intrinsic::amdgcn_raw_buffer_store_format:
9741 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9742 const bool IsFormat =
9743 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9744 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9745
9746 SDValue VData = Op.getOperand(2);
9747 EVT VDataVT = VData.getValueType();
9748 EVT EltType = VDataVT.getScalarType();
9749 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9750 if (IsD16) {
9751 VData = handleD16VData(VData, DAG);
9752 VDataVT = VData.getValueType();
9753 }
9754
9755 if (!isTypeLegal(VDataVT)) {
9756 VData =
9757 DAG.getNode(ISD::BITCAST, DL,
9758 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9759 }
9760
9761 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9762 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9763 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9764 SDValue Ops[] = {
9765 Chain,
9766 VData,
9767 Rsrc,
9768 DAG.getConstant(0, DL, MVT::i32), // vindex
9769 VOffset, // voffset
9770 SOffset, // soffset
9771 Offset, // offset
9772 Op.getOperand(6), // cachepolicy, swizzled buffer
9773 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9774 };
9775 unsigned Opc =
9776 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9777 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9778 MemSDNode *M = cast<MemSDNode>(Op);
9779
9780 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9781 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9782 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9783
9784 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9785 M->getMemoryVT(), M->getMemOperand());
9786 }
9787
9788 case Intrinsic::amdgcn_struct_buffer_store:
9789 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9790 case Intrinsic::amdgcn_struct_buffer_store_format:
9791 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9792 const bool IsFormat =
9793 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9794 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9795
9796 SDValue VData = Op.getOperand(2);
9797 EVT VDataVT = VData.getValueType();
9798 EVT EltType = VDataVT.getScalarType();
9799 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9800
9801 if (IsD16) {
9802 VData = handleD16VData(VData, DAG);
9803 VDataVT = VData.getValueType();
9804 }
9805
9806 if (!isTypeLegal(VDataVT)) {
9807 VData =
9808 DAG.getNode(ISD::BITCAST, DL,
9809 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9810 }
9811
9812 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9813 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9814 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9815 SDValue Ops[] = {
9816 Chain,
9817 VData,
9818 Rsrc,
9819 Op.getOperand(4), // vindex
9820 VOffset, // voffset
9821 SOffset, // soffset
9822 Offset, // offset
9823 Op.getOperand(7), // cachepolicy, swizzled buffer
9824 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9825 };
9826 unsigned Opc =
9827 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9828 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9829 MemSDNode *M = cast<MemSDNode>(Op);
9830
9831 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9832 EVT VDataType = VData.getValueType().getScalarType();
9833 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9834 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9835
9836 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9837 M->getMemoryVT(), M->getMemOperand());
9838 }
9839 case Intrinsic::amdgcn_raw_buffer_load_lds:
9840 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9841 case Intrinsic::amdgcn_struct_buffer_load_lds:
9842 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9843 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9844 unsigned Opc;
9845 bool HasVIndex =
9846 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9847 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9848 unsigned OpOffset = HasVIndex ? 1 : 0;
9849 SDValue VOffset = Op.getOperand(5 + OpOffset);
9850 bool HasVOffset = !isNullConstant(VOffset);
9851 unsigned Size = Op->getConstantOperandVal(4);
9852
9853 switch (Size) {
9854 default:
9855 return SDValue();
9856 case 1:
9857 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9858 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9859 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9860 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9861 break;
9862 case 2:
9863 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9864 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9865 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9866 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9867 break;
9868 case 4:
9869 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9870 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9871 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9872 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9873 break;
9874 case 12:
9875 if (!Subtarget->hasLDSLoadB96_B128())
9876 return SDValue();
9877 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9878 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9879 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9880 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9881 break;
9882 case 16:
9883 if (!Subtarget->hasLDSLoadB96_B128())
9884 return SDValue();
9885 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9886 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9887 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9888 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9889 break;
9890 }
9891
9892 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9893
9894 SmallVector<SDValue, 8> Ops;
9895
9896 if (HasVIndex && HasVOffset)
9897 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9898 {Op.getOperand(5), // VIndex
9899 VOffset}));
9900 else if (HasVIndex)
9901 Ops.push_back(Op.getOperand(5));
9902 else if (HasVOffset)
9903 Ops.push_back(VOffset);
9904
9905 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9906 Ops.push_back(Rsrc);
9907 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9908 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9909 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9910 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9911 Ops.push_back(DAG.getTargetConstant(
9912 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9913 DL, MVT::i8)); // cpol
9914 Ops.push_back(DAG.getTargetConstant(
9915 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9916 ? 1
9917 : 0,
9918 DL, MVT::i8)); // swz
9919 Ops.push_back(M0Val.getValue(0)); // Chain
9920 Ops.push_back(M0Val.getValue(1)); // Glue
9921
9922 auto *M = cast<MemSDNode>(Op);
9923 MachineMemOperand *LoadMMO = M->getMemOperand();
9924 // Don't set the offset value here because the pointer points to the base of
9925 // the buffer.
9926 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9927
9928 MachinePointerInfo StorePtrI = LoadPtrI;
9929 LoadPtrI.V = PoisonValue::get(
9930 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9931 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9932 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9933
9934 auto F = LoadMMO->getFlags() &
9935 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9936 LoadMMO =
9937 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9938 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9939
9940 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9941 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9942 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9943
9944 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9945 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9946
9947 return SDValue(Load, 0);
9948 }
9949 case Intrinsic::amdgcn_global_load_lds: {
9950 unsigned Opc;
9951 unsigned Size = Op->getConstantOperandVal(4);
9952 switch (Size) {
9953 default:
9954 return SDValue();
9955 case 1:
9956 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9957 break;
9958 case 2:
9959 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9960 break;
9961 case 4:
9962 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9963 break;
9964 case 12:
9965 if (!Subtarget->hasLDSLoadB96_B128())
9966 return SDValue();
9967 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9968 break;
9969 case 16:
9970 if (!Subtarget->hasLDSLoadB96_B128())
9971 return SDValue();
9972 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
9973 break;
9974 }
9975
9976 auto *M = cast<MemSDNode>(Op);
9977 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9978
9979 SmallVector<SDValue, 6> Ops;
9980
9981 SDValue Addr = Op.getOperand(2); // Global ptr
9982 SDValue VOffset;
9983 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9984 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9985 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9986 SDValue LHS = Addr.getOperand(0);
9987 SDValue RHS = Addr.getOperand(1);
9988
9989 if (LHS->isDivergent())
9990 std::swap(LHS, RHS);
9991
9992 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9993 RHS.getOperand(0).getValueType() == MVT::i32) {
9994 // add (i64 sgpr), (zero_extend (i32 vgpr))
9995 Addr = LHS;
9996 VOffset = RHS.getOperand(0);
9997 }
9998 }
9999
10000 Ops.push_back(Addr);
10001 if (!Addr->isDivergent()) {
10002 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10003 if (!VOffset)
10004 VOffset =
10005 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10006 DAG.getTargetConstant(0, DL, MVT::i32)),
10007 0);
10008 Ops.push_back(VOffset);
10009 }
10010
10011 Ops.push_back(Op.getOperand(5)); // Offset
10012 Ops.push_back(Op.getOperand(6)); // CPol
10013 Ops.push_back(M0Val.getValue(0)); // Chain
10014 Ops.push_back(M0Val.getValue(1)); // Glue
10015
10016 MachineMemOperand *LoadMMO = M->getMemOperand();
10017 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10018 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10019 MachinePointerInfo StorePtrI = LoadPtrI;
10020 LoadPtrI.V = PoisonValue::get(
10021 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10022 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10023 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10024 auto F = LoadMMO->getFlags() &
10025 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10026 LoadMMO =
10027 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10028 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10029 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10030 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10031 LoadMMO->getAAInfo());
10032
10033 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10034 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10035
10036 return SDValue(Load, 0);
10037 }
10038 case Intrinsic::amdgcn_end_cf:
10039 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10040 Op->getOperand(2), Chain),
10041 0);
10042 case Intrinsic::amdgcn_s_barrier_init:
10043 case Intrinsic::amdgcn_s_barrier_signal_var: {
10044 // these two intrinsics have two operands: barrier pointer and member count
10045 SDValue Chain = Op->getOperand(0);
10046 SmallVector<SDValue, 2> Ops;
10047 SDValue BarOp = Op->getOperand(2);
10048 SDValue CntOp = Op->getOperand(3);
10049 SDValue M0Val;
10050 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10051 ? AMDGPU::S_BARRIER_INIT_M0
10052 : AMDGPU::S_BARRIER_SIGNAL_M0;
10053 // extract the BarrierID from bits 4-9 of BarOp
10054 SDValue BarID;
10055 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10056 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10057 BarID =
10058 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10059 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10060 0);
10061 // Member count should be put into M0[ShAmt:+6]
10062 // Barrier ID should be put into M0[5:0]
10063 M0Val =
10064 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10065 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10066 0);
10067 constexpr unsigned ShAmt = 16;
10068 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10069 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10070
10071 M0Val = SDValue(
10072 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10073
10074 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10075
10076 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10077 return SDValue(NewMI, 0);
10078 }
10079 case Intrinsic::amdgcn_s_barrier_join:
10080 case Intrinsic::amdgcn_s_wakeup_barrier: {
10081 // These two intrinsics have one operand: barrier pointer
10082 SDValue Chain = Op->getOperand(0);
10083 SmallVector<SDValue, 2> Ops;
10084 SDValue BarOp = Op->getOperand(2);
10085 unsigned Opc;
10086
10087 if (isa<ConstantSDNode>(BarOp)) {
10088 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10089 switch (IntrinsicID) {
10090 default:
10091 return SDValue();
10092 case Intrinsic::amdgcn_s_barrier_join:
10093 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10094 break;
10095 case Intrinsic::amdgcn_s_wakeup_barrier:
10096 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
10097 break;
10098 }
10099 // extract the BarrierID from bits 4-9 of the immediate
10100 unsigned BarID = (BarVal >> 4) & 0x3F;
10101 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10102 Ops.push_back(K);
10103 Ops.push_back(Chain);
10104 } else {
10105 switch (IntrinsicID) {
10106 default:
10107 return SDValue();
10108 case Intrinsic::amdgcn_s_barrier_join:
10109 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10110 break;
10111 case Intrinsic::amdgcn_s_wakeup_barrier:
10112 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
10113 break;
10114 }
10115 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10116 SDValue M0Val;
10117 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10118 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10119 M0Val =
10120 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10121 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10122 0);
10123 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10124 }
10125
10126 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10127 return SDValue(NewMI, 0);
10128 }
10129 case Intrinsic::amdgcn_s_prefetch_data: {
10130 // For non-global address space preserve the chain and remove the call.
10131 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10132 return Op.getOperand(0);
10133 return Op;
10134 }
10135 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10136 SDValue Ops[] = {
10137 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10138 Op.getOperand(3), // offset
10139 Op.getOperand(4), // length
10140 };
10141
10142 MemSDNode *M = cast<MemSDNode>(Op);
10143 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10144 Op->getVTList(), Ops, M->getMemoryVT(),
10145 M->getMemOperand());
10146 }
10147 default: {
10148 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10149 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10150 return lowerImage(Op, ImageDimIntr, DAG, true);
10151
10152 return Op;
10153 }
10154 }
10155}
10156
10157// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10158// offset (the offset that is included in bounds checking and swizzling, to be
10159// split between the instruction's voffset and immoffset fields) and soffset
10160// (the offset that is excluded from bounds checking and swizzling, to go in
10161// the instruction's soffset field). This function takes the first kind of
10162// offset and figures out how to split it between voffset and immoffset.
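// For illustration (the actual maximum immediate offset is subtarget
// dependent, see getMaxMUBUFImmOffset below): with a 4095-byte immoffset
// field, a combined offset of 8196 is split into an immoffset of 4 and a
// voffset add of 8192, so the large power-of-two part can be CSEd across
// neighbouring accesses.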
10163std::pair<SDValue, SDValue>
10164SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10165 SDLoc DL(Offset);
10166 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10167 SDValue N0 = Offset;
10168 ConstantSDNode *C1 = nullptr;
10169
10170 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10171 N0 = SDValue();
10172 else if (DAG.isBaseWithConstantOffset(N0)) {
10173 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10174 N0 = N0.getOperand(0);
10175 }
10176
10177 if (C1) {
10178 unsigned ImmOffset = C1->getZExtValue();
10179 // If the immediate value is too big for the immoffset field, put only bits
10180 // that would normally fit in the immoffset field. The remaining value that
10181 // is copied/added for the voffset field is a large power of 2, and it
10182 // stands more chance of being CSEd with the copy/add for another similar
10183 // load/store.
10184 // However, do not do that rounding down if that is a negative
10185 // number, as it appears to be illegal to have a negative offset in the
10186 // vgpr, even if adding the immediate offset makes it positive.
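// e.g. (again assuming a 4095 maximum, for illustration) an immediate of
// 0xfffff004 would round down to an overflow of 0xfffff000, which is
// negative as an i32, so the whole value is moved to the voffset and the
// immoffset becomes 0.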
10187 unsigned Overflow = ImmOffset & ~MaxImm;
10188 ImmOffset -= Overflow;
10189 if ((int32_t)Overflow < 0) {
10190 Overflow += ImmOffset;
10191 ImmOffset = 0;
10192 }
10193 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10194 if (Overflow) {
10195 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10196 if (!N0)
10197 N0 = OverflowVal;
10198 else {
10199 SDValue Ops[] = {N0, OverflowVal};
10200 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10201 }
10202 }
10203 }
10204 if (!N0)
10205 N0 = DAG.getConstant(0, DL, MVT::i32);
10206 if (!C1)
10207 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10208 return {N0, SDValue(C1, 0)};
10209}
10210
10211// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10212// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10213// pointed to by Offsets.
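// For example (illustrative only): a constant combined offset that does not
// fit the MUBUF immediate field is split so that the part that fits goes in
// instoffset and the remainder goes in soffset, leaving voffset at zero;
// anything non-constant simply becomes the voffset with soffset and
// instoffset left at zero.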
10214void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10215 SelectionDAG &DAG, SDValue *Offsets,
10216 Align Alignment) const {
10217 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10218 SDLoc DL(CombinedOffset);
10219 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10220 uint32_t Imm = C->getZExtValue();
10221 uint32_t SOffset, ImmOffset;
10222 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10223 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10224 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10225 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10226 return;
10227 }
10228 }
10229 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10230 SDValue N0 = CombinedOffset.getOperand(0);
10231 SDValue N1 = CombinedOffset.getOperand(1);
10232 uint32_t SOffset, ImmOffset;
10233 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10234 if (Offset >= 0 &&
10235 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10236 Offsets[0] = N0;
10237 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10238 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10239 return;
10240 }
10241 }
10242
10243 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10244 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10245 : DAG.getConstant(0, DL, MVT::i32);
10246
10247 Offsets[0] = CombinedOffset;
10248 Offsets[1] = SOffsetZero;
10249 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10250}
10251
10252SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10253 SelectionDAG &DAG) const {
10254 if (!MaybePointer.getValueType().isScalarInteger())
10255 return MaybePointer;
10256
10257 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10258 return Rsrc;
10259}
10260
10261// Wrap a global or flat pointer into a buffer intrinsic using the flags
10262// specified in the intrinsic.
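// (Word layout as constructed below: word0 = pointer[31:0],
// word1 = pointer[47:32] | (stride << 16), word2 = NumRecords,
// word3 = Flags.)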
10263SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10264 SelectionDAG &DAG) const {
10265 SDLoc Loc(Op);
10266
10267 SDValue Pointer = Op->getOperand(1);
10268 SDValue Stride = Op->getOperand(2);
10269 SDValue NumRecords = Op->getOperand(3);
10270 SDValue Flags = Op->getOperand(4);
10271
10272 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10273 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10274 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10275 std::optional<uint32_t> ConstStride = std::nullopt;
10276 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10277 ConstStride = ConstNode->getZExtValue();
10278
10279 SDValue NewHighHalf = Masked;
10280 if (!ConstStride || *ConstStride != 0) {
10281 SDValue ShiftedStride;
10282 if (ConstStride) {
10283 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10284 } else {
10285 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10286 ShiftedStride =
10287 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10288 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10289 }
10290 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10291 }
10292
10293 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10294 NewHighHalf, NumRecords, Flags);
10295 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10296 return RsrcPtr;
10297}
10298
10299// Handle 8 bit and 16 bit buffer loads
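// When TFE is requested the load returns an extra status dword, so the node
// built below produces a v2i32 from which the data and status words are
// extracted and merged separately.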
10300SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10301 EVT LoadVT, SDLoc DL,
10302 ArrayRef<SDValue> Ops,
10303 MachineMemOperand *MMO,
10304 bool IsTFE) const {
10305 EVT IntVT = LoadVT.changeTypeToInteger();
10306
10307 if (IsTFE) {
10308 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10309 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10310 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10311 MachineFunction &MF = DAG.getMachineFunction();
10312 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10313 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10314 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10315 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10316 DAG.getConstant(1, DL, MVT::i32));
10317 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10318 DAG.getConstant(0, DL, MVT::i32));
10319 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10320 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10321 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10322 }
10323
10324 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10325 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10326 : AMDGPUISD::BUFFER_LOAD_USHORT;
10327
10328 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10329 SDValue BufferLoad =
10330 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10331 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10332 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10333
10334 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10335}
10336
10337// Handle 8 bit and 16 bit buffer stores
10338SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10339 EVT VDataType, SDLoc DL,
10340 SDValue Ops[],
10341 MemSDNode *M) const {
10342 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10343 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10344
10345 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10346 Ops[1] = BufferStoreExt;
10347 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10348 : AMDGPUISD::BUFFER_STORE_SHORT;
10349 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10350 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10351 M->getMemOperand());
10352}
10353
10354 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10355 SDValue Op, const SDLoc &SL, EVT VT) {
10356 if (VT.bitsLT(Op.getValueType()))
10357 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10358
10359 switch (ExtType) {
10360 case ISD::SEXTLOAD:
10361 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10362 case ISD::ZEXTLOAD:
10363 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10364 case ISD::EXTLOAD:
10365 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10366 case ISD::NON_EXTLOAD:
10367 return Op;
10368 }
10369
10370 llvm_unreachable("invalid ext type");
10371}
10372
10373// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10374// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
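// For example, a uniform, dword-aligned zextload of i8 from the constant
// address space is rewritten below as a full 32-bit load followed by a
// zero-extend-in-reg of the low 8 bits, which can then select to SMEM.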
10375SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10376 DAGCombinerInfo &DCI) const {
10377 SelectionDAG &DAG = DCI.DAG;
10378 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10379 return SDValue();
10380
10381 // FIXME: Constant loads should all be marked invariant.
10382 unsigned AS = Ld->getAddressSpace();
10383 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10384 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10385 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10386 return SDValue();
10387
10388 // Don't do this early, since it may interfere with adjacent load merging for
10389 // illegal types. We can avoid losing alignment information for exotic types
10390 // pre-legalize.
10391 EVT MemVT = Ld->getMemoryVT();
10392 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10393 MemVT.getSizeInBits() >= 32)
10394 return SDValue();
10395
10396 SDLoc SL(Ld);
10397
10398 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10399 "unexpected vector extload");
10400
10401 // TODO: Drop only high part of range.
10402 SDValue Ptr = Ld->getBasePtr();
10403 SDValue NewLoad = DAG.getLoad(
10404 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10405 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10406 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10407 nullptr); // Drop ranges
10408
10409 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10410 if (MemVT.isFloatingPoint()) {
10411 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10412 "unexpected fp extload");
10413 TruncVT = MemVT.changeTypeToInteger();
10414 }
10415
10416 SDValue Cvt = NewLoad;
10417 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10418 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10419 DAG.getValueType(TruncVT));
10420 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10421 Ld->getExtensionType() == ISD::EXTLOAD) {
10422 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10423 } else {
10424 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
10425 }
10426
10427 EVT VT = Ld->getValueType(0);
10428 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10429
10430 DCI.AddToWorklist(Cvt.getNode());
10431
10432 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10433 // the appropriate extension from the 32-bit load.
10434 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10435 DCI.AddToWorklist(Cvt.getNode());
10436
10437 // Handle conversion back to floating point if necessary.
10438 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10439
10440 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10441}
10442
10443 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10444 const SIMachineFunctionInfo &Info) {
10445 // TODO: Should check if the address can definitely not access stack.
10446 if (Info.isEntryFunction())
10447 return Info.getUserSGPRInfo().hasFlatScratchInit();
10448 return true;
10449}
10450
10451SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10452 SDLoc DL(Op);
10453 LoadSDNode *Load = cast<LoadSDNode>(Op);
10454 ISD::LoadExtType ExtType = Load->getExtensionType();
10455 EVT MemVT = Load->getMemoryVT();
10456 MachineMemOperand *MMO = Load->getMemOperand();
10457
10458 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10459 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10460 return SDValue();
10461
10462 // FIXME: Copied from PPC
10463 // First, load into 32 bits, then truncate to 1 bit.
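// e.g. an i1 load becomes an i8 extload into an i32 plus a truncate; for a
// vector of i1 each element is recovered from the loaded word with a
// shift-right by its index followed by a truncate.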
10464
10465 SDValue Chain = Load->getChain();
10466 SDValue BasePtr = Load->getBasePtr();
10467
10468 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10469
10470 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10471 RealMemVT, MMO);
10472
10473 if (!MemVT.isVector()) {
10474 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10475 NewLD.getValue(1)};
10476
10477 return DAG.getMergeValues(Ops, DL);
10478 }
10479
10480 SmallVector<SDValue, 3> Elts;
10481 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10482 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10483 DAG.getConstant(I, DL, MVT::i32));
10484
10485 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10486 }
10487
10488 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10489
10490 return DAG.getMergeValues(Ops, DL);
10491 }
10492
10493 if (!MemVT.isVector())
10494 return SDValue();
10495
10496 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10497 "Custom lowering for non-i32 vectors hasn't been implemented.");
10498
10499 Align Alignment = Load->getAlign();
10500 unsigned AS = Load->getAddressSpace();
10501 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10502 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10503 return SplitVectorLoad(Op, DAG);
10504 }
10505
10506 MachineFunction &MF = DAG.getMachineFunction();
10507 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10508 // If there is a possibility that flat instructions access scratch memory
10509 // then we need to use the same legalization rules we use for private.
10510 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10511 !Subtarget->hasMultiDwordFlatScratchAddressing())
10512 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10513 ? AMDGPUAS::PRIVATE_ADDRESS
10514 : AMDGPUAS::GLOBAL_ADDRESS;
10515
10516 unsigned NumElements = MemVT.getVectorNumElements();
10517
10518 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10519 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10520 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10521 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10522 isMemOpHasNoClobberedMemOperand(Load))) {
10523 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10524 Alignment >= Align(4) && NumElements < 32) {
10525 if (MemVT.isPow2VectorType() ||
10526 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10527 return SDValue();
10528 return WidenOrSplitVectorLoad(Op, DAG);
10529 }
10530 // Non-uniform loads will be selected to MUBUF instructions, so they
10531 // have the same legalization requirements as global and private
10532 // loads.
10533 //
10534 }
10535 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10536 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10537 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10538 if (NumElements > 4)
10539 return SplitVectorLoad(Op, DAG);
10540 // v3 loads not supported on SI.
10541 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10542 return WidenOrSplitVectorLoad(Op, DAG);
10543
10544 // v3 and v4 loads are supported for private and global memory.
10545 return SDValue();
10546 }
10547 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10548 // Depending on the setting of the private_element_size field in the
10549 // resource descriptor, we can only make private accesses up to a certain
10550 // size.
10551 switch (Subtarget->getMaxPrivateElementSize()) {
10552 case 4: {
10553 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10554 return DAG.getMergeValues({Op0, Op1}, DL);
10555 }
10556 case 8:
10557 if (NumElements > 2)
10558 return SplitVectorLoad(Op, DAG);
10559 return SDValue();
10560 case 16:
10561 // Same as global/flat
10562 if (NumElements > 4)
10563 return SplitVectorLoad(Op, DAG);
10564 // v3 loads not supported on SI.
10565 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10566 return WidenOrSplitVectorLoad(Op, DAG);
10567
10568 return SDValue();
10569 default:
10570 llvm_unreachable("unsupported private_element_size");
10571 }
10572 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10573 unsigned Fast = 0;
10574 auto Flags = Load->getMemOperand()->getFlags();
10575 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10576 Load->getAlign(), Flags, &Fast) &&
10577 Fast > 1)
10578 return SDValue();
10579
10580 if (MemVT.isVector())
10581 return SplitVectorLoad(Op, DAG);
10582 }
10583
10584 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10585 MemVT, *Load->getMemOperand())) {
10586 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10587 return DAG.getMergeValues({Op0, Op1}, DL);
10588 }
10589
10590 return SDValue();
10591}
10592
10593SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10594 EVT VT = Op.getValueType();
10595 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10596 VT.getSizeInBits() == 512)
10597 return splitTernaryVectorOp(Op, DAG);
10598
10599 assert(VT.getSizeInBits() == 64);
10600
10601 SDLoc DL(Op);
10602 SDValue Cond = Op.getOperand(0);
10603
10604 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10605 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10606
10607 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10608 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10609
10610 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10611 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10612
10613 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10614
10615 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10616 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10617
10618 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10619
10620 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10621 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10622}
10623
10624// Catch division cases where we can use shortcuts with rcp and rsq
10625// instructions.
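// Under the conditions checked below this turns 1.0/x into rcp(x),
// -1.0/x into rcp(-x), and a general x/y into x * rcp(y).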
10626SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10627 SelectionDAG &DAG) const {
10628 SDLoc SL(Op);
10629 SDValue LHS = Op.getOperand(0);
10630 SDValue RHS = Op.getOperand(1);
10631 EVT VT = Op.getValueType();
10632 const SDNodeFlags Flags = Op->getFlags();
10633
10634 bool AllowInaccurateRcp =
10635 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10636
10637 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10638 // Without !fpmath accuracy information, we can't do more because we don't
10639 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10640 // f16 is always accurate enough
10641 if (!AllowInaccurateRcp && VT != MVT::f16)
10642 return SDValue();
10643
10644 if (CLHS->isExactlyValue(1.0)) {
10645 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10646 // the CI documentation have a worst case error of 1 ulp.
10647 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10648 // use it as long as we aren't trying to use denormals.
10649 //
10650 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a worst case error of 0.51 ulp.
10651
10652 // 1.0 / sqrt(x) -> rsq(x)
10653
10654 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10655 // error seems really high at 2^29 ULP.
10656 // 1.0 / x -> rcp(x)
10657 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10658 }
10659
10660 // Same as for 1.0, but expand the sign out of the constant.
10661 if (CLHS->isExactlyValue(-1.0)) {
10662 // -1.0 / x -> rcp (fneg x)
10663 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10664 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10665 }
10666 }
10667
10668 // For f16 require afn or arcp.
10669 // For f32 require afn.
10670 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10671 return SDValue();
10672
10673 // Turn into multiply by the reciprocal.
10674 // x / y -> x * (1.0 / y)
10675 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10676 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10677}
10678
10679SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10680 SelectionDAG &DAG) const {
10681 SDLoc SL(Op);
10682 SDValue X = Op.getOperand(0);
10683 SDValue Y = Op.getOperand(1);
10684 EVT VT = Op.getValueType();
10685 const SDNodeFlags Flags = Op->getFlags();
10686
10687 bool AllowInaccurateDiv =
10688 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10689 if (!AllowInaccurateDiv)
10690 return SDValue();
10691
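// What follows is two Newton-Raphson refinement steps of rcp(y), expressed
// with FMA (r' = r + r*(1 - y*r)), then one correction step applied to the
// quotient x*r.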
10692 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10693 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10694
10695 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10696 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10697
10698 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10699 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10700 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10701 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10702 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10703 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10704}
10705
10706static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10707 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10708 SDNodeFlags Flags) {
10709 if (GlueChain->getNumValues() <= 1) {
10710 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10711 }
10712
10713 assert(GlueChain->getNumValues() == 3);
10714
10715 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10716 switch (Opcode) {
10717 default:
10718 llvm_unreachable("no chain equivalent for opcode");
10719 case ISD::FMUL:
10720 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10721 break;
10722 }
10723
10724 return DAG.getNode(Opcode, SL, VTList,
10725 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10726 Flags);
10727}
10728
10729static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10730 EVT VT, SDValue A, SDValue B, SDValue C,
10731 SDValue GlueChain, SDNodeFlags Flags) {
10732 if (GlueChain->getNumValues() <= 1) {
10733 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10734 }
10735
10736 assert(GlueChain->getNumValues() == 3);
10737
10738 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10739 switch (Opcode) {
10740 default:
10741 llvm_unreachable("no chain equivalent for opcode");
10742 case ISD::FMA:
10743 Opcode = AMDGPUISD::FMA_W_CHAIN;
10744 break;
10745 }
10746
10747 return DAG.getNode(Opcode, SL, VTList,
10748 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10749 Flags);
10750}
10751
10752SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10753 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10754 return FastLowered;
10755
10756 SDLoc SL(Op);
10757 SDValue LHS = Op.getOperand(0);
10758 SDValue RHS = Op.getOperand(1);
10759
10760 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10761 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10762 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10763 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10764 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10765 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10766 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10767 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10768 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10769 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10770 // q16.u = opx(V_CVT_F16_F32, q32.u);
10771 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10772
10773 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10774 unsigned FMADOpCode =
10775 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10776
10777 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10778 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10779 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10780 SDValue Rcp =
10781 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10782 SDValue Quot =
10783 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10784 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10785 Op->getFlags());
10786 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10787 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10788 Op->getFlags());
10789 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10790 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10791 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10792 DAG.getConstant(0xff800000, SL, MVT::i32));
10793 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10794 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10795 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10796 DAG.getTargetConstant(0, SL, MVT::i32));
10797 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10798 Op->getFlags());
10799}
10800
10801// Faster 2.5 ULP division that does not support denormals.
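// To keep the reciprocal in range, a denominator with magnitude above 2^96
// is pre-scaled by 2^-32 and the final quotient is multiplied by the same
// scale factor again; otherwise the scale is simply 1.0.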
10802SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10803 SDNodeFlags Flags = Op->getFlags();
10804 SDLoc SL(Op);
10805 SDValue LHS = Op.getOperand(1);
10806 SDValue RHS = Op.getOperand(2);
10807
10808 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10809
10810 const APFloat K0Val(0x1p+96f);
10811 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10812
10813 const APFloat K1Val(0x1p-32f);
10814 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10815
10816 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10817
10818 EVT SetCCVT =
10819 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10820
10821 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10822
10823 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10824
10825 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10826
10827 // rcp does not support denormals.
10828 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10829
10830 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10831
10832 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10833}
10834
10835// Returns immediate value for setting the F32 denorm mode when using the
10836// S_DENORM_MODE instruction.
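// The immediate packs the f32 mode in bits [1:0] and keeps the current
// f64/f16 mode in bits [3:2], as built below from fpDenormModeDPValue().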
10837 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10838 const SIMachineFunctionInfo *Info,
10839 const GCNSubtarget *ST) {
10840 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10841 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10842 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10843 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10844}
10845
10846SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10847 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10848 return FastLowered;
10849
10850 // The selection matcher assumes anything with a chain selecting to a
10851 // mayRaiseFPException machine instruction. Since we're introducing a chain
10852 // here, we need to explicitly report nofpexcept for the regular fdiv
10853 // lowering.
10854 SDNodeFlags Flags = Op->getFlags();
10855 Flags.setNoFPExcept(true);
10856
10857 SDLoc SL(Op);
10858 SDValue LHS = Op.getOperand(0);
10859 SDValue RHS = Op.getOperand(1);
10860
10861 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10862
10863 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10864
10865 SDValue DenominatorScaled =
10866 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10867 SDValue NumeratorScaled =
10868 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10869
10870 // Denominator is scaled to not be denormal, so using rcp is ok.
10871 SDValue ApproxRcp =
10872 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10873 SDValue NegDivScale0 =
10874 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10875
10876 using namespace AMDGPU::Hwreg;
10877 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10878 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10879
10880 const MachineFunction &MF = DAG.getMachineFunction();
10881 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10882 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10883
10884 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10885 const bool HasDynamicDenormals =
10886 (DenormMode.Input == DenormalMode::Dynamic) ||
10887 (DenormMode.Output == DenormalMode::Dynamic);
10888
10889 SDValue SavedDenormMode;
10890
10891 if (!PreservesDenormals) {
10892 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10893 // lowering. The chain dependence is insufficient, and we need glue. We do
10894 // not need the glue variants in a strictfp function.
10895
10896 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10897
10898 SDValue Glue = DAG.getEntryNode();
10899 if (HasDynamicDenormals) {
10900 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10901 DAG.getVTList(MVT::i32, MVT::Glue),
10902 {BitField, Glue});
10903 SavedDenormMode = SDValue(GetReg, 0);
10904
10905 Glue = DAG.getMergeValues(
10906 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10907 }
10908
10909 SDNode *EnableDenorm;
10910 if (Subtarget->hasDenormModeInst()) {
10911 const SDValue EnableDenormValue =
10912 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10913
10914 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10915 EnableDenormValue)
10916 .getNode();
10917 } else {
10918 const SDValue EnableDenormValue =
10919 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10920 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10921 {EnableDenormValue, BitField, Glue});
10922 }
10923
10924 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10925 SDValue(EnableDenorm, 1)};
10926
10927 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10928 }
10929
10930 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10931 ApproxRcp, One, NegDivScale0, Flags);
10932
10933 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10934 ApproxRcp, Fma0, Flags);
10935
10936 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10937 Fma1, Flags);
10938
10939 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10940 NumeratorScaled, Mul, Flags);
10941
10942 SDValue Fma3 =
10943 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10944
10945 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10946 NumeratorScaled, Fma3, Flags);
10947
10948 if (!PreservesDenormals) {
10949 SDNode *DisableDenorm;
10950 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10951 const SDValue DisableDenormValue = getSPDenormModeValue(
10952 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10953
10954 DisableDenorm =
10955 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10956 DisableDenormValue, Fma4.getValue(2))
10957 .getNode();
10958 } else {
10959 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10960 const SDValue DisableDenormValue =
10961 HasDynamicDenormals
10962 ? SavedDenormMode
10963 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10964
10965 DisableDenorm = DAG.getMachineNode(
10966 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10967 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10968 }
10969
10970 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10971 SDValue(DisableDenorm, 0), DAG.getRoot());
10972 DAG.setRoot(OutputChain);
10973 }
10974
10975 SDValue Scale = NumeratorScaled.getValue(1);
10976 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10977 {Fma4, Fma1, Fma3, Scale}, Flags);
10978
10979 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10980}
10981
10982SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10983 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10984 return FastLowered;
10985
10986 SDLoc SL(Op);
10987 SDValue X = Op.getOperand(0);
10988 SDValue Y = Op.getOperand(1);
10989
10990 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10991
10992 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10993
10994 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10995
10996 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10997
10998 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10999
11000 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11001
11002 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11003
11004 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11005
11006 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11007
11008 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11009 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11010
11011 SDValue Fma4 =
11012 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11013
11014 SDValue Scale;
11015
11016 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11017 // Workaround a hardware bug on SI where the condition output from div_scale
11018 // is not usable.
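// The replacement below compares the high dwords of the numerator and
// denominator against the high dwords of the two div_scale results to work
// out which operand was actually scaled.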
11019
11020 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11021
11022 // Figure out which scale to use for div_fmas.
11023 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11024 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11025 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11026 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11027
11028 SDValue NumHi =
11029 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11030 SDValue DenHi =
11031 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11032
11033 SDValue Scale0Hi =
11034 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11035 SDValue Scale1Hi =
11036 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11037
11038 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11039 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11040 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11041 } else {
11042 Scale = DivScale1.getValue(1);
11043 }
11044
11045 SDValue Fmas =
11046 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11047
11048 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11049}
11050
11051SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11052 EVT VT = Op.getValueType();
11053
11054 if (VT == MVT::f32)
11055 return LowerFDIV32(Op, DAG);
11056
11057 if (VT == MVT::f64)
11058 return LowerFDIV64(Op, DAG);
11059
11060 if (VT == MVT::f16)
11061 return LowerFDIV16(Op, DAG);
11062
11063 llvm_unreachable("Unexpected type for fdiv");
11064}
11065
11066SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11067 SDLoc dl(Op);
11068 SDValue Val = Op.getOperand(0);
11069 EVT VT = Val.getValueType();
11070 EVT ResultExpVT = Op->getValueType(1);
11071 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11072
11073 SDValue Mant = DAG.getNode(
11074 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11075 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11076
11077 SDValue Exp = DAG.getNode(
11078 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11079 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11080
11081 if (Subtarget->hasFractBug()) {
11082 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11083 SDValue Inf =
11084 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11085
11086 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11087 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11088 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11089 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11090 }
11091
11092 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11093 return DAG.getMergeValues({Mant, CastExp}, dl);
11094}
11095
11096SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11097 SDLoc DL(Op);
11098 StoreSDNode *Store = cast<StoreSDNode>(Op);
11099 EVT VT = Store->getMemoryVT();
11100
11101 if (VT == MVT::i1) {
11102 return DAG.getTruncStore(
11103 Store->getChain(), DL,
11104 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11105 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11106 }
11107
11108 assert(VT.isVector() &&
11109 Store->getValue().getValueType().getScalarType() == MVT::i32);
11110
11111 unsigned AS = Store->getAddressSpace();
11112 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11113 Store->getAlign().value() < VT.getStoreSize() &&
11114 VT.getSizeInBits() > 32) {
11115 return SplitVectorStore(Op, DAG);
11116 }
11117
11118 MachineFunction &MF = DAG.getMachineFunction();
11119 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11120 // If there is a possibility that flat instructions access scratch memory
11121 // then we need to use the same legalization rules we use for private.
11122 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11123 !Subtarget->hasMultiDwordFlatScratchAddressing())
11124 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11125 ? AMDGPUAS::PRIVATE_ADDRESS
11126 : AMDGPUAS::GLOBAL_ADDRESS;
11127
11128 unsigned NumElements = VT.getVectorNumElements();
11129 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11130 if (NumElements > 4)
11131 return SplitVectorStore(Op, DAG);
11132 // v3 stores not supported on SI.
11133 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11134 return SplitVectorStore(Op, DAG);
11135
11136 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11137 VT, *Store->getMemOperand()))
11138 return expandUnalignedStore(Store, DAG);
11139
11140 return SDValue();
11141 }
11142 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11143 switch (Subtarget->getMaxPrivateElementSize()) {
11144 case 4:
11145 return scalarizeVectorStore(Store, DAG);
11146 case 8:
11147 if (NumElements > 2)
11148 return SplitVectorStore(Op, DAG);
11149 return SDValue();
11150 case 16:
11151 if (NumElements > 4 ||
11152 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11153 return SplitVectorStore(Op, DAG);
11154 return SDValue();
11155 default:
11156 llvm_unreachable("unsupported private_element_size");
11157 }
11158 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11159 unsigned Fast = 0;
11160 auto Flags = Store->getMemOperand()->getFlags();
11161 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11162 Store->getAlign(), Flags, &Fast) &&
11163 Fast > 1)
11164 return SDValue();
11165
11166 if (VT.isVector())
11167 return SplitVectorStore(Op, DAG);
11168
11169 return expandUnalignedStore(Store, DAG);
11170 }
11171
11172 // Probably an invalid store. If so we'll end up emitting a selection error.
11173 return SDValue();
11174}
11175
11176// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11177SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11178 SDLoc SL(Op);
11179 assert(!Subtarget->has16BitInsts());
11180 SDNodeFlags Flags = Op->getFlags();
11181 SDValue Ext =
11182 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11183
11184 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11185 SDValue Sqrt =
11186 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11187
11188 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11189 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11190}
11191
11192SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11193 SDLoc DL(Op);
11194 SDNodeFlags Flags = Op->getFlags();
11195 MVT VT = Op.getValueType().getSimpleVT();
11196 const SDValue X = Op.getOperand(0);
11197
11198 if (allowApproxFunc(DAG, Flags)) {
11199 // Instruction is 1ulp but ignores denormals.
11200 return DAG.getNode(
11201 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11202 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11203 }
11204
11205 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11206 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11207
11208 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11209
11210 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11211
11212 SDValue SqrtX =
11213 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11214
11215 SDValue SqrtS;
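// When denormals must be honoured, the hardware sqrt result is refined by
// testing the neighbouring representable values (one ulp down and one ulp
// up) and picking whichever leaves a residual of the right sign; otherwise
// a cheaper rsq-based refinement is used.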
11216 if (needsDenormHandlingF32(DAG, X, Flags)) {
11217 SDValue SqrtID =
11218 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11219 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11220
11221 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11222 SDValue SqrtSNextDownInt =
11223 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11224 DAG.getAllOnesConstant(DL, MVT::i32));
11225 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11226
11227 SDValue NegSqrtSNextDown =
11228 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11229
11230 SDValue SqrtVP =
11231 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11232
11233 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11234 DAG.getConstant(1, DL, MVT::i32));
11235 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11236
11237 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11238 SDValue SqrtVS =
11239 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11240
11241 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11242 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11243
11244 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11245 Flags);
11246
11247 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11248 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11249 Flags);
11250 } else {
11251 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11252
11253 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11254
11255 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11256 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11257 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11258
11259 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11260 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11261 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11262
11263 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11264 SDValue SqrtD =
11265 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11266 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11267 }
11268
11269 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11270
11271 SDValue ScaledDown =
11272 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11273
11274 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11275 SDValue IsZeroOrInf =
11276 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11277 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11278
11279 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11280}
11281
11282SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11283 // For double type, the SQRT and RSQ instructions don't have required
11284 // precision, we apply Goldschmidt's algorithm to improve the result:
11285 //
11286 // y0 = rsq(x)
11287 // g0 = x * y0
11288 // h0 = 0.5 * y0
11289 //
11290 // r0 = 0.5 - h0 * g0
11291 // g1 = g0 * r0 + g0
11292 // h1 = h0 * r0 + h0
11293 //
11294 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11295 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11296 // h2 = h1 * r1 + h1
11297 //
11298 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11299 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11300 //
11301 // sqrt(x) = g3
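// Inputs smaller than 2^-767 are scaled up by 2^256 via ldexp before the
// iteration and the result is scaled back down by 2^-128 afterwards, since
// sqrt halves the exponent.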
11302
11303 SDNodeFlags Flags = Op->getFlags();
11304
11305 SDLoc DL(Op);
11306
11307 SDValue X = Op.getOperand(0);
11308 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11309
11310 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11311
11312 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11313
11314 // Scale up input if it is too small.
11315 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11316 SDValue ScaleUp =
11317 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11318 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11319
11320 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11321
11322 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11323
11324 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11325 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11326
11327 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11328 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11329
11330 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11331
11332 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11333
11334 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11335 SDValue SqrtD0 =
11336 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11337
11338 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11339
11340 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11341 SDValue SqrtD1 =
11342 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11343
11344 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11345
11346 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11347 SDValue ScaleDown =
11348 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11349 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11350
11351 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11352 // with finite only or nsz because rsq(+/-0) = +/-inf
11353
11354 // TODO: Check for DAZ and expand to subnormals
11355 SDValue IsZeroOrInf =
11356 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11357 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11358
11359 // If x is +INF, +0, or -0, use its original value
11360 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11361 Flags);
11362}
11363
11364SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11365 SDLoc DL(Op);
11366 EVT VT = Op.getValueType();
11367 SDValue Arg = Op.getOperand(0);
11368 SDValue TrigVal;
11369
11370 // Propagate fast-math flags so that the multiply we introduce can be folded
11371 // if Arg is already the result of a multiply by constant.
11372 auto Flags = Op->getFlags();
11373
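// The hardware sin/cos take their input in units of full turns, so the
// argument is first multiplied by 1/(2*pi); on subtargets with a reduced
// trig range the fractional part is taken as well.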
11374 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11375
11376 if (Subtarget->hasTrigReducedRange()) {
11377 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11378 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11379 } else {
11380 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11381 }
11382
11383 switch (Op.getOpcode()) {
11384 case ISD::FCOS:
11385 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11386 case ISD::FSIN:
11387 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11388 default:
11389 llvm_unreachable("Wrong trig opcode");
11390 }
11391}
11392
11393SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11394 SelectionDAG &DAG) const {
11395 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11396 assert(AtomicNode->isCompareAndSwap());
11397 unsigned AS = AtomicNode->getAddressSpace();
11398
11399 // No custom lowering required for local address space
11400 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
11401 return Op;
11402
11403 // Non-local address space requires custom lowering for atomic compare
11404 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11405 SDLoc DL(Op);
11406 SDValue ChainIn = Op.getOperand(0);
11407 SDValue Addr = Op.getOperand(1);
11408 SDValue Old = Op.getOperand(2);
11409 SDValue New = Op.getOperand(3);
11410 EVT VT = Op.getValueType();
11411 MVT SimpleVT = VT.getSimpleVT();
11412 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11413
11414 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11415 SDValue Ops[] = {ChainIn, Addr, NewOld};
11416
11417 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11418 Op->getVTList(), Ops, VT,
11419 AtomicNode->getMemOperand());
11420}
11421
11422//===----------------------------------------------------------------------===//
11423// Custom DAG optimizations
11424//===----------------------------------------------------------------------===//
11425
11426SDValue
11427SITargetLowering::performUCharToFloatCombine(SDNode *N,
11428 DAGCombinerInfo &DCI) const {
11429 EVT VT = N->getValueType(0);
11430 EVT ScalarVT = VT.getScalarType();
11431 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11432 return SDValue();
11433
11434 SelectionDAG &DAG = DCI.DAG;
11435 SDLoc DL(N);
11436
11437 SDValue Src = N->getOperand(0);
11438 EVT SrcVT = Src.getValueType();
11439
11440 // TODO: We could try to match extracting the higher bytes, which would be
11441 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11442 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11443 // about in practice.
11444 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11445 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11446 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11447 DCI.AddToWorklist(Cvt.getNode());
11448
11449 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11450 if (ScalarVT != MVT::f32) {
11451 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11452 DAG.getTargetConstant(0, DL, MVT::i32));
11453 }
11454 return Cvt;
11455 }
11456 }
11457
11458 return SDValue();
11459}
11460
11461SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11462 DAGCombinerInfo &DCI) const {
11463 SDValue MagnitudeOp = N->getOperand(0);
11464 SDValue SignOp = N->getOperand(1);
11465 SelectionDAG &DAG = DCI.DAG;
11466 SDLoc DL(N);
11467
11468 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11469 // lower half with a copy.
11470 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11471 if (MagnitudeOp.getValueType() == MVT::f64) {
11472 SDValue MagAsVector =
11473 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11474 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11475 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11476 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11477 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11478
11479 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11480
11481 SDValue Vector =
11482 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11483
11484 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11485 }
11486
11487 if (SignOp.getValueType() != MVT::f64)
11488 return SDValue();
11489
11490 // Reduce width of sign operand, we only need the highest bit.
11491 //
11492 // fcopysign f64:x, f64:y ->
11493 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11494 // TODO: In some cases it might make sense to go all the way to f16.
11495 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11496 SDValue SignAsF32 =
11497 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11498 DAG.getConstant(1, DL, MVT::i32));
11499
11500 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11501 SignAsF32);
11502}
11503
11504// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11505// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11506// bits
11507
11508// This is a variant of
11509// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11510//
11511// The normal DAG combiner will do this, but only if the add has one use since
11512// that would increase the number of instructions.
11513//
11514// This prevents us from seeing a constant offset that can be folded into a
11515// memory instruction's addressing mode. If we know the resulting add offset of
11516// a pointer can be folded into an addressing offset, we can replace the pointer
11517// operand with the add of new constant offset. This eliminates one of the uses,
11518// and may allow the remaining use to also be simplified.
11519//
11520SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11521 EVT MemVT,
11522 DAGCombinerInfo &DCI) const {
11523 SDValue N0 = N->getOperand(0);
11524 SDValue N1 = N->getOperand(1);
11525
11526 // We only do this to handle cases where it's profitable when there are
11527 // multiple uses of the add, so defer to the standard combine.
11528 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11529 N0->hasOneUse())
11530 return SDValue();
11531
11532 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11533 if (!CN1)
11534 return SDValue();
11535
11536 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11537 if (!CAdd)
11538 return SDValue();
11539
11540 SelectionDAG &DAG = DCI.DAG;
11541
11542 if (N0->getOpcode() == ISD::OR &&
11543 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11544 return SDValue();
11545
11546 // If the resulting offset is too large, we can't fold it into the
11547 // addressing mode offset.
11548 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11549 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11550
11551 AddrMode AM;
11552 AM.HasBaseReg = true;
11553 AM.BaseOffs = Offset.getSExtValue();
11554 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11555 return SDValue();
11556
11557 SDLoc SL(N);
11558 EVT VT = N->getValueType(0);
11559
11560 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11561 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11562
11563 SDNodeFlags Flags;
11564 Flags.setNoUnsignedWrap(
11565 N->getFlags().hasNoUnsignedWrap() &&
11566 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11567
11568 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11569}
11570
11571/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11572/// by the chain and intrinsic ID. Theoretically we would also need to check the
11573/// specific intrinsic, but they all place the pointer operand first.
11574static unsigned getBasePtrIndex(const MemSDNode *N) {
11575 switch (N->getOpcode()) {
11576 case ISD::STORE:
11577 case ISD::INTRINSIC_W_CHAIN:
11578 case ISD::INTRINSIC_VOID:
11579 return 2;
11580 default:
11581 return 1;
11582 }
11583}
11584
11585SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11586 DAGCombinerInfo &DCI) const {
11587 SelectionDAG &DAG = DCI.DAG;
11588 SDLoc SL(N);
11589
11590 unsigned PtrIdx = getBasePtrIndex(N);
11591 SDValue Ptr = N->getOperand(PtrIdx);
11592
11593 // TODO: We could also do this for multiplies.
11594 if (Ptr.getOpcode() == ISD::SHL) {
11595 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11596 N->getMemoryVT(), DCI);
11597 if (NewPtr) {
11598 SmallVector<SDValue, 8> NewOps(N->ops());
11599
11600 NewOps[PtrIdx] = NewPtr;
11601 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11602 }
11603 }
11604
11605 return SDValue();
11606}
11607
11608static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11609 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11610 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11611 (Opc == ISD::XOR && Val == 0);
11612}
11613
11614// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11615// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11616// integer combine opportunities since most 64-bit operations are decomposed
11617// this way. TODO: We won't want this for SALU especially if it is an inline
11618// immediate.
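// e.g. (and i64:x, 0x00000000ffffffff) splits into an AND of the low half
// with -1 and an AND of the high half with 0, both of which fold away.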
11619SDValue SITargetLowering::splitBinaryBitConstantOp(
11620 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11621 const ConstantSDNode *CRHS) const {
11622 uint64_t Val = CRHS->getZExtValue();
11623 uint32_t ValLo = Lo_32(Val);
11624 uint32_t ValHi = Hi_32(Val);
11625 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11626
11627 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11628 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11629 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11630 // If we need to materialize a 64-bit immediate, it will be split up later
11631 // anyway. Avoid creating the harder to understand 64-bit immediate
11632 // materialization.
11633 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11634 }
11635
11636 return SDValue();
11637}
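// For example, (and i64:x, 0x0000ffff00000000) would be split into an and of
// the low half with 0 (which folds away to 0) and an and of the high half with
// 0x0000ffff, so effectively only a single 32-bit and of the high half remains
// for the later 32-bit combines to work on.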
11638
11639bool llvm::isBoolSGPR(SDValue V) {
11640 if (V.getValueType() != MVT::i1)
11641 return false;
11642 switch (V.getOpcode()) {
11643 default:
11644 break;
11645 case ISD::SETCC:
11646 case AMDGPUISD::FP_CLASS:
11647 return true;
11648 case ISD::AND:
11649 case ISD::OR:
11650 case ISD::XOR:
11651 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11652 }
11653 return false;
11654}
11655
11656// If a constant has all zeroes or all ones within each byte return it.
11657// Otherwise return 0.
11658static uint32_t getConstantPermuteMask(uint32_t C) {
11659 // 0xff for any zero byte in the mask
11660 uint32_t ZeroByteMask = 0;
11661 if (!(C & 0x000000ff))
11662 ZeroByteMask |= 0x000000ff;
11663 if (!(C & 0x0000ff00))
11664 ZeroByteMask |= 0x0000ff00;
11665 if (!(C & 0x00ff0000))
11666 ZeroByteMask |= 0x00ff0000;
11667 if (!(C & 0xff000000))
11668 ZeroByteMask |= 0xff000000;
11669 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11670 if ((NonZeroByteMask & C) != NonZeroByteMask)
11671 return 0; // Partial bytes selected.
11672 return C;
11673}
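// For example, getConstantPermuteMask(0x00ff00ff) returns 0x00ff00ff since
// every byte is either 0x00 or 0xff, while getConstantPermuteMask(0x0012ff00)
// returns 0 because byte 2 (0x12) is only partially selected.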
11674
11675// Check if a node selects whole bytes from its operand 0 starting at a byte
11676// boundary while masking the rest. Returns the select mask as used by
11677// v_perm_b32, or ~0 (-1) if it did not succeed.
11678// Note byte select encoding:
11679// value 0-3 selects corresponding source byte;
11680// value 0xc selects zero;
11681// value 0xff selects 0xff.
11682static uint32_t getPermuteMask(SDValue V) {
11683 assert(V.getValueSizeInBits() == 32);
11684
11685 if (V.getNumOperands() != 2)
11686 return ~0;
11687
11688 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11689 if (!N1)
11690 return ~0;
11691
11692 uint32_t C = N1->getZExtValue();
11693
11694 switch (V.getOpcode()) {
11695 default:
11696 break;
11697 case ISD::AND:
11698 if (uint32_t ConstMask = getConstantPermuteMask(C))
11699 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11700 break;
11701
11702 case ISD::OR:
11703 if (uint32_t ConstMask = getConstantPermuteMask(C))
11704 return (0x03020100 & ~ConstMask) | ConstMask;
11705 break;
11706
11707 case ISD::SHL:
11708 if (C % 8)
11709 return ~0;
11710
11711 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11712
11713 case ISD::SRL:
11714 if (C % 8)
11715 return ~0;
11716
11717 return uint32_t(0x0c0c0c0c03020100ull >> C);
11718 }
11719
11720 return ~0;
11721}
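// A few worked examples of the resulting select masks:
//   (and x, 0x0000ffff) -> 0x0c0c0100 (keep the low two bytes of x, zero the rest)
//   (shl x, 16)         -> 0x01000c0c (bytes 1:0 of x move to bytes 3:2, low bytes zero)
//   (srl x, 16)         -> 0x0c0c0302 (bytes 3:2 of x move to bytes 1:0, upper bytes zero)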
11722
11723SDValue SITargetLowering::performAndCombine(SDNode *N,
11724 DAGCombinerInfo &DCI) const {
11725 if (DCI.isBeforeLegalize())
11726 return SDValue();
11727
11728 SelectionDAG &DAG = DCI.DAG;
11729 EVT VT = N->getValueType(0);
11730 SDValue LHS = N->getOperand(0);
11731 SDValue RHS = N->getOperand(1);
11732
11733 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11734 if (VT == MVT::i64 && CRHS) {
11735 if (SDValue Split =
11736 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11737 return Split;
11738 }
11739
11740 if (CRHS && VT == MVT::i32) {
11741 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11742 // nb = number of trailing zeroes in mask
11743 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11744 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
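// For example, (and (srl x, 8), 0xff00) has Mask = 0xff00 (Bits = 8, nb = 8)
// and Shift = 8, so Offset = 16 and the combine would emit
// (shl (bfe x, 16, 8), 8), which the SDWA peephole pass can then absorb.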
11745 uint64_t Mask = CRHS->getZExtValue();
11746 unsigned Bits = llvm::popcount(Mask);
11747 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11748 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11749 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11750 unsigned Shift = CShift->getZExtValue();
11751 unsigned NB = CRHS->getAPIntValue().countr_zero();
11752 unsigned Offset = NB + Shift;
11753 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11754 SDLoc SL(N);
11755 SDValue BFE =
11756 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11757 DAG.getConstant(Offset, SL, MVT::i32),
11758 DAG.getConstant(Bits, SL, MVT::i32));
11759 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11760 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11761 DAG.getValueType(NarrowVT));
11762 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11763 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11764 return Shl;
11765 }
11766 }
11767 }
11768
11769 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11770 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11771 isa<ConstantSDNode>(LHS.getOperand(2))) {
11772 uint32_t Sel = getConstantPermuteMask(Mask);
11773 if (!Sel)
11774 return SDValue();
11775
11776 // Select 0xc for all zero bytes
11777 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11778 SDLoc DL(N);
11779 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11780 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11781 }
11782 }
11783
11784 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11785 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
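// With the SIInstrFlags bit assignments the resulting mask is 0x1f8 (both
// signs of normal, subnormal and zero), so the two compares collapse into a
// single v_cmp_class with that immediate.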
11786 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11787 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11788 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11789
11790 SDValue X = LHS.getOperand(0);
11791 SDValue Y = RHS.getOperand(0);
11792 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11793 !isTypeLegal(X.getValueType()))
11794 return SDValue();
11795
11796 if (LCC == ISD::SETO) {
11797 if (X != LHS.getOperand(1))
11798 return SDValue();
11799
11800 if (RCC == ISD::SETUNE) {
11801 const ConstantFPSDNode *C1 =
11802 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11803 if (!C1 || !C1->isInfinity() || C1->isNegative())
11804 return SDValue();
11805
11806 const uint32_t Mask = SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
11807 SIInstrFlags::N_SUBNORMAL |
11808 SIInstrFlags::P_SUBNORMAL |
11809 SIInstrFlags::N_NORMAL | SIInstrFlags::P_NORMAL;
11810
11811 static_assert(
11812 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11813 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11814 0x3ff) == Mask,
11815 "mask not equal");
11816
11817 SDLoc DL(N);
11818 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11819 DAG.getConstant(Mask, DL, MVT::i32));
11820 }
11821 }
11822 }
11823
11824 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11825 std::swap(LHS, RHS);
11826
11827 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11828 RHS.hasOneUse()) {
11829 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11830 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(s_nan | q_nan)
11831 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (s_nan | q_nan)
11832 // i.e. only the NaN class bits depend on the ordered-ness of the compare.
11833 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11834 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11835 (RHS.getOperand(0) == LHS.getOperand(0) &&
11836 LHS.getOperand(0) == LHS.getOperand(1))) {
11837 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11838 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11839 : Mask->getZExtValue() & OrdMask;
11840
11841 SDLoc DL(N);
11842 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11843 DAG.getConstant(NewMask, DL, MVT::i32));
11844 }
11845 }
11846
11847 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11848 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11849 // and x, (sext cc from i1) => select cc, x, 0
11850 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11851 std::swap(LHS, RHS);
11852 if (isBoolSGPR(RHS.getOperand(0)))
11853 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11854 DAG.getConstant(0, SDLoc(N), MVT::i32));
11855 }
11856
11857 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11858 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11859 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11860 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11861 uint32_t LHSMask = getPermuteMask(LHS);
11862 uint32_t RHSMask = getPermuteMask(RHS);
11863 if (LHSMask != ~0u && RHSMask != ~0u) {
11864 // Canonicalize the expression in an attempt to have fewer unique masks
11865 // and therefore fewer registers used to hold the masks.
11866 if (LHSMask > RHSMask) {
11867 std::swap(LHSMask, RHSMask);
11868 std::swap(LHS, RHS);
11869 }
11870
11871 // Select 0xc for each lane used from the source operand. Zero has the 0xc
11872 // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11873 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11874 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11875
11876 // Check if we need to combine values from two sources within a byte.
11877 if (!(LHSUsedLanes & RHSUsedLanes) &&
11878 // If we select high and lower word keep it for SDWA.
11879 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11880 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11881 // Each byte in each mask is either a selector value 0-3, or has higher
11882 // bits set (0xff to select 0xff, or 0x0c to select zero). If 0x0c is in
11883 // either mask the result byte must be 0x0c. Otherwise the mask which is
11884 // not 0xff wins. By anding both masks we get a correct result, except
11885 // that any byte meant to be 0x0c must be corrected to exactly 0x0c.
11886 uint32_t Mask = LHSMask & RHSMask;
11887 for (unsigned I = 0; I < 32; I += 8) {
11888 uint32_t ByteSel = 0xff << I;
11889 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11890 Mask &= (0x0c << I) & 0xffffffff;
11891 }
11892
11893 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11894 // or 0x0c.
11895 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11896 SDLoc DL(N);
11897
11898 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11899 RHS.getOperand(0),
11900 DAG.getConstant(Sel, DL, MVT::i32));
11901 }
11902 }
11903 }
11904
11905 return SDValue();
11906}
11907
11908 // A key component of v_perm is a mapping between the byte positions of the
11909 // src operands and the byte positions of the dest. To provide this, we need:
11910 // 1. the node that provides byte x of the dest of the OR, and 2. the byte of
11911 // that node used to provide byte x. calculateByteProvider finds which node
11912 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
11913 // that node and finds the ultimate src and byte position. For example, the
11914 // supported LoadCombine pattern for vector loads is as follows
11915// t1
11916// or
11917// / \
11918// t2 t3
11919// zext shl
11920// | | \
11921// t4 t5 16
11922// or anyext
11923// / \ |
11924// t6 t7 t8
11925// srl shl or
11926// / | / \ / \
11927// t9 t10 t11 t12 t13 t14
11928// trunc* 8 trunc* 8 and and
11929// | | / | | \
11930// t15 t16 t17 t18 t19 t20
11931// trunc* 255 srl -256
11932// | / \
11933// t15 t15 16
11934//
11935// *In this example, the truncs are from i32->i16
11936//
11937// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11938// respectively. calculateSrcByte would find (given node) -> ultimate src &
11939// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11940// After finding the mapping, we can combine the tree into vperm t15, t16,
11941// 0x05000407
11942
11943// Find the source and byte position from a node.
11944// \p DestByte is the byte position of the dest of the or that the src
11945// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11946// byte of the dest of the or. \p Depth tracks how many recursive iterations we
11947// have performed.
11948static const std::optional<ByteProvider<SDValue>>
11949calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11950 unsigned Depth = 0) {
11951 // We may need to recursively traverse a series of SRLs
11952 if (Depth >= 6)
11953 return std::nullopt;
11954
11955 if (Op.getValueSizeInBits() < 8)
11956 return std::nullopt;
11957
11958 if (Op.getValueType().isVector())
11959 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11960
11961 switch (Op->getOpcode()) {
11962 case ISD::TRUNCATE: {
11963 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11964 }
11965
11966 case ISD::SIGN_EXTEND:
11967 case ISD::ZERO_EXTEND:
11968 case ISD::SIGN_EXTEND_INREG: {
11969 SDValue NarrowOp = Op->getOperand(0);
11970 auto NarrowVT = NarrowOp.getValueType();
11971 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11972 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11973 NarrowVT = VTSign->getVT();
11974 }
11975 if (!NarrowVT.isByteSized())
11976 return std::nullopt;
11977 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11978
11979 if (SrcIndex >= NarrowByteWidth)
11980 return std::nullopt;
11981 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11982 }
11983
11984 case ISD::SRA:
11985 case ISD::SRL: {
11986 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11987 if (!ShiftOp)
11988 return std::nullopt;
11989
11990 uint64_t BitShift = ShiftOp->getZExtValue();
11991
11992 if (BitShift % 8 != 0)
11993 return std::nullopt;
11994
11995 SrcIndex += BitShift / 8;
11996
11997 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11998 }
11999
12000 default: {
12001 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12002 }
12003 }
12004 llvm_unreachable("fully handled switch");
12005}
12006
12007// For a byte position in the result of an Or, traverse the tree and find the
12008// node (and the byte of the node) which ultimately provides this {Or,
12009// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12010// the byte position of the Op that corresponds with the originally requested
12011 // byte of the Or. \p Depth tracks how many recursive iterations we have
12012 // performed. \p StartingIndex is the originally requested byte of the Or.
12013static const std::optional<ByteProvider<SDValue>>
12014calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12015 unsigned StartingIndex = 0) {
12016 // Finding Src tree of RHS of or typically requires at least 1 additional
12017 // depth
12018 if (Depth > 6)
12019 return std::nullopt;
12020
12021 unsigned BitWidth = Op.getScalarValueSizeInBits();
12022 if (BitWidth % 8 != 0)
12023 return std::nullopt;
12024 if (Index > BitWidth / 8 - 1)
12025 return std::nullopt;
12026
12027 bool IsVec = Op.getValueType().isVector();
12028 switch (Op.getOpcode()) {
12029 case ISD::OR: {
12030 if (IsVec)
12031 return std::nullopt;
12032
12033 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12034 StartingIndex);
12035 if (!RHS)
12036 return std::nullopt;
12037 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12038 StartingIndex);
12039 if (!LHS)
12040 return std::nullopt;
12041 // A well-formed Or will have two ByteProviders for each byte, one of which
12042 // is constant zero.
12043 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12044 return std::nullopt;
12045 if (!LHS || LHS->isConstantZero())
12046 return RHS;
12047 if (!RHS || RHS->isConstantZero())
12048 return LHS;
12049 return std::nullopt;
12050 }
12051
12052 case ISD::AND: {
12053 if (IsVec)
12054 return std::nullopt;
12055
12056 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12057 if (!BitMaskOp)
12058 return std::nullopt;
12059
12060 uint32_t BitMask = BitMaskOp->getZExtValue();
12062 // Bits we expect to be set for the byte at Index
12062 uint32_t IndexMask = 0xFF << (Index * 8);
12063
12064 if ((IndexMask & BitMask) != IndexMask) {
12065 // If the result of the and partially provides the byte, then it
12066 // is not well formed.
12067 if (IndexMask & BitMask)
12068 return std::nullopt;
12069 return ByteProvider<SDValue>::getConstantZero();
12070 }
12071
12072 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12073 }
12074
12075 case ISD::FSHR: {
12076 if (IsVec)
12077 return std::nullopt;
12078
12079 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12080 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12081 if (!ShiftOp || Op.getValueType().isVector())
12082 return std::nullopt;
12083
12084 uint64_t BitsProvided = Op.getValueSizeInBits();
12085 if (BitsProvided % 8 != 0)
12086 return std::nullopt;
12087
12088 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12089 if (BitShift % 8)
12090 return std::nullopt;
12091
12092 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12093 uint64_t ByteShift = BitShift / 8;
12094
12095 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12096 uint64_t BytesProvided = BitsProvided / 8;
12097 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12098 NewIndex %= BytesProvided;
12099 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12100 }
12101
12102 case ISD::SRA:
12103 case ISD::SRL: {
12104 if (IsVec)
12105 return std::nullopt;
12106
12107 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12108 if (!ShiftOp)
12109 return std::nullopt;
12110
12111 uint64_t BitShift = ShiftOp->getZExtValue();
12112 if (BitShift % 8)
12113 return std::nullopt;
12114
12115 auto BitsProvided = Op.getScalarValueSizeInBits();
12116 if (BitsProvided % 8 != 0)
12117 return std::nullopt;
12118
12119 uint64_t BytesProvided = BitsProvided / 8;
12120 uint64_t ByteShift = BitShift / 8;
12121 // The dest of the shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12122 // If the byte we are trying to provide (as tracked by Index) falls in this
12123 // range, then the SRL provides the byte. The byte of interest of the src of
12124 // the SRL is Index + ByteShift.
12125 return BytesProvided - ByteShift > Index
12126 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12127 Index + ByteShift)
12128 : std::nullopt;
12129 }
12130
12131 case ISD::SHL: {
12132 if (IsVec)
12133 return std::nullopt;
12134
12135 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12136 if (!ShiftOp)
12137 return std::nullopt;
12138
12139 uint64_t BitShift = ShiftOp->getZExtValue();
12140 if (BitShift % 8 != 0)
12141 return std::nullopt;
12142 uint64_t ByteShift = BitShift / 8;
12143
12144 // If we are shifting by an amount greater than (or equal to)
12145 // the index we are trying to provide, then it provides 0s. If not,
12146 // then these bytes are not definitively 0s, and the corresponding byte
12147 // of interest is Index - ByteShift of the src.
12148 return Index < ByteShift
12149 ? ByteProvider<SDValue>::getConstantZero()
12150 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12151 Depth + 1, StartingIndex);
12152 }
12153 case ISD::ANY_EXTEND:
12154 case ISD::SIGN_EXTEND:
12155 case ISD::ZERO_EXTEND:
12156 case ISD::SIGN_EXTEND_INREG:
12157 case ISD::AssertZext:
12158 case ISD::AssertSext: {
12159 if (IsVec)
12160 return std::nullopt;
12161
12162 SDValue NarrowOp = Op->getOperand(0);
12163 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12164 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12165 Op->getOpcode() == ISD::AssertZext ||
12166 Op->getOpcode() == ISD::AssertSext) {
12167 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12168 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12169 }
12170 if (NarrowBitWidth % 8 != 0)
12171 return std::nullopt;
12172 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12173
12174 if (Index >= NarrowByteWidth)
12175 return Op.getOpcode() == ISD::ZERO_EXTEND
12176 ? std::optional<ByteProvider<SDValue>>(
12177 ByteProvider<SDValue>::getConstantZero())
12178 : std::nullopt;
12179 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12180 }
12181
12182 case ISD::TRUNCATE: {
12183 if (IsVec)
12184 return std::nullopt;
12185
12186 uint64_t NarrowByteWidth = BitWidth / 8;
12187
12188 if (NarrowByteWidth >= Index) {
12189 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12190 StartingIndex);
12191 }
12192
12193 return std::nullopt;
12194 }
12195
12196 case ISD::CopyFromReg: {
12197 if (BitWidth / 8 > Index)
12198 return calculateSrcByte(Op, StartingIndex, Index);
12199
12200 return std::nullopt;
12201 }
12202
12203 case ISD::LOAD: {
12204 auto *L = cast<LoadSDNode>(Op.getNode());
12205
12206 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12207 if (NarrowBitWidth % 8 != 0)
12208 return std::nullopt;
12209 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12210
12211 // If the width of the load does not reach the byte we are trying to provide
12212 // for and it is not a ZEXTLOAD, then the load does not provide for the byte
12213 // in question.
12214 if (Index >= NarrowByteWidth) {
12215 return L->getExtensionType() == ISD::ZEXTLOAD
12216 ? std::optional<ByteProvider<SDValue>>(
12217 ByteProvider<SDValue>::getConstantZero())
12218 : std::nullopt;
12219 }
12220
12221 if (NarrowByteWidth > Index) {
12222 return calculateSrcByte(Op, StartingIndex, Index);
12223 }
12224
12225 return std::nullopt;
12226 }
12227
12228 case ISD::BSWAP: {
12229 if (IsVec)
12230 return std::nullopt;
12231
12232 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12233 Depth + 1, StartingIndex);
12234 }
12235
12236 case ISD::EXTRACT_VECTOR_ELT: {
12237 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12238 if (!IdxOp)
12239 return std::nullopt;
12240 auto VecIdx = IdxOp->getZExtValue();
12241 auto ScalarSize = Op.getScalarValueSizeInBits();
12242 if (ScalarSize < 32)
12243 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12244 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12245 StartingIndex, Index);
12246 }
12247
12248 case AMDGPUISD::PERM: {
12249 if (IsVec)
12250 return std::nullopt;
12251
12252 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12253 if (!PermMask)
12254 return std::nullopt;
12255
12256 auto IdxMask =
12257 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12258 if (IdxMask > 0x07 && IdxMask != 0x0c)
12259 return std::nullopt;
12260
12261 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12262 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12263
12264 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12265 : ByteProvider<SDValue>(
12266 ByteProvider<SDValue>::getConstantZero());
12267 }
12268
12269 default: {
12270 return std::nullopt;
12271 }
12272 }
12273
12274 llvm_unreachable("fully handled switch");
12275}
12276
12277 // Returns true if the Operand is a scalar extended from (or loaded as) 16 bits
12278static bool isExtendedFrom16Bits(SDValue &Operand) {
12279
12280 switch (Operand.getOpcode()) {
12281 case ISD::ANY_EXTEND:
12282 case ISD::SIGN_EXTEND:
12283 case ISD::ZERO_EXTEND: {
12284 auto OpVT = Operand.getOperand(0).getValueType();
12285 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12286 }
12287 case ISD::LOAD: {
12288 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12289 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12290 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12291 ExtType == ISD::EXTLOAD) {
12292 auto MemVT = L->getMemoryVT();
12293 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12294 }
12295 return L->getMemoryVT().getSizeInBits() == 16;
12296 }
12297 default:
12298 return false;
12299 }
12300}
12301
12302 // Returns true if the mask matches consecutive bytes, and the first byte
12303 // begins at an even (16-bit aligned) byte offset from the 0th byte.
12304static bool addresses16Bits(int Mask) {
12305 int Low8 = Mask & 0xff;
12306 int Hi8 = (Mask & 0xff00) >> 8;
12307
12308 assert(Low8 < 8 && Hi8 < 8);
12309 // Are the bytes contiguous in the order of increasing addresses.
12310 bool IsConsecutive = (Hi8 - Low8 == 1);
12311 // Is the first byte at location that is aligned for 16 bit instructions.
12312 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12313 // In this case, we still need code to extract the 16 bit operand, so it
12314 // is better to use i8 v_perm
12315 bool Is16Aligned = !(Low8 % 2);
12316
12317 return IsConsecutive && Is16Aligned;
12318}
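// For example, addresses16Bits(0x0504) is true: bytes 4 and 5 are consecutive
// and the pair starts at an even byte, so it can be addressed as one 16-bit
// operand. addresses16Bits(0x0201) is false because the pair starts at byte 1.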
12319
12320// Do not lower into v_perm if the operands are actually 16 bit
12321// and the selected bits (based on PermMask) correspond with two
12322// easily addressable 16 bit operands.
12323static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12324 SDValue &OtherOp) {
12325 int Low16 = PermMask & 0xffff;
12326 int Hi16 = (PermMask & 0xffff0000) >> 16;
12327
12328 auto TempOp = peekThroughBitcasts(Op);
12329 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12330
12331 auto OpIs16Bit =
12332 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12333 if (!OpIs16Bit)
12334 return true;
12335
12336 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12337 isExtendedFrom16Bits(TempOtherOp);
12338 if (!OtherOpIs16Bit)
12339 return true;
12340
12341 // Do we cleanly address both
12342 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12343}
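// For example, with PermMask = 0x05040100 and both inputs extended from 16-bit
// values, each half of the mask addresses one aligned 16-bit chunk of one
// source, so this returns false and the caller keeps the values in 16-bit form
// instead of emitting a v_perm.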
12344
12345static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12346 unsigned DWordOffset) {
12347 SDValue Ret;
12348
12349 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12350 // ByteProvider must be at least 8 bits
12351 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12352
12353 if (TypeSize <= 32)
12354 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12355
12356 if (Src.getValueType().isVector()) {
12357 auto ScalarTySize = Src.getScalarValueSizeInBits();
12358 auto ScalarTy = Src.getValueType().getScalarType();
12359 if (ScalarTySize == 32) {
12360 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12361 DAG.getConstant(DWordOffset, SL, MVT::i32));
12362 }
12363 if (ScalarTySize > 32) {
12364 Ret = DAG.getNode(
12365 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12366 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12367 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12368 if (ShiftVal)
12369 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12370 DAG.getConstant(ShiftVal, SL, MVT::i32));
12371 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12372 }
12373
12374 assert(ScalarTySize < 32);
12375 auto NumElements = TypeSize / ScalarTySize;
12376 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12377 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12378 auto NumElementsIn32 = 32 / ScalarTySize;
12379 auto NumAvailElements = DWordOffset < Trunc32Elements
12380 ? NumElementsIn32
12381 : NumElements - NormalizedTrunc;
12382
12383 SmallVector<SDValue, 4> VecSrcs;
12384 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12385 NumAvailElements);
12386
12387 Ret = DAG.getBuildVector(
12388 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12389 VecSrcs);
12390 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12391 }
12392
12393 /// Scalar Type
12394 auto ShiftVal = 32 * DWordOffset;
12395 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12396 DAG.getConstant(ShiftVal, SL, MVT::i32));
12397 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12398}
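// For example, an i64 source with DWordOffset = 1 takes the scalar path: a
// shift right by 32 followed by a truncation to i32. A v4i16 source with
// DWordOffset = 1 instead extracts elements 2 and 3 and bitcasts the resulting
// v2i16 to i32.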
12399
12400static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12401 SelectionDAG &DAG = DCI.DAG;
12402 [[maybe_unused]] EVT VT = N->getValueType(0);
12403 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12404
12405 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12406 assert(VT == MVT::i32);
12407 for (int i = 0; i < 4; i++) {
12408 // Find the ByteProvider that provides the ith byte of the result of OR
12409 std::optional<ByteProvider<SDValue>> P =
12410 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12411 // TODO support constantZero
12412 if (!P || P->isConstantZero())
12413 return SDValue();
12414
12415 PermNodes.push_back(*P);
12416 }
12417 if (PermNodes.size() != 4)
12418 return SDValue();
12419
12420 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12421 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12422 uint64_t PermMask = 0x00000000;
12423 for (size_t i = 0; i < PermNodes.size(); i++) {
12424 auto PermOp = PermNodes[i];
12425 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12426 // by sizeof(Src2) = 4
12427 int SrcByteAdjust = 4;
12428
12429 // If the Src uses a byte from a different DWORD, then it corresponds
12430 // with a different source.
12431 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12432 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12433 if (SecondSrc)
12434 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12435 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12436 return SDValue();
12437
12438 // Set the index of the second distinct Src node
12439 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12440 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12441 SrcByteAdjust = 0;
12442 }
12443 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12445 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12446 }
12447 SDLoc DL(N);
12448 SDValue Op = *PermNodes[FirstSrc.first].Src;
12449 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12450 assert(Op.getValueSizeInBits() == 32);
12451
12452 // Check that we are not just extracting the bytes in order from an op
12453 if (!SecondSrc) {
12454 int Low16 = PermMask & 0xffff;
12455 int Hi16 = (PermMask & 0xffff0000) >> 16;
12456
12457 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12458 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12459
12460 // The perm op would really just produce Op. So combine into Op
12461 if (WellFormedLow && WellFormedHi)
12462 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12463 }
12464
12465 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12466
12467 if (SecondSrc) {
12468 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12469 assert(OtherOp.getValueSizeInBits() == 32);
12470 }
12471
12472 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12473
12474 assert(Op.getValueType().isByteSized() &&
12475 OtherOp.getValueType().isByteSized());
12476
12477 // If the ultimate src is less than 32 bits, then we will only be
12478 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12479 // CalculateByteProvider would not have returned Op as source if we
12480 // used a byte that is outside its ValueType. Thus, we are free to
12481 // ANY_EXTEND as the extended bits are don't-cares.
12482 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12483 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12484
12485 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12486 DAG.getConstant(PermMask, DL, MVT::i32));
12487 }
12488 return SDValue();
12489}
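// As an illustration, if bytes 0-1 of the or come from bytes 0-1 of dword A
// (FirstSrc) and bytes 2-3 come from bytes 0-1 of dword B (SecondSrc), the
// loop above builds PermMask = 0x01000504 (A's selectors are offset by 4
// because A is the first v_perm source), giving perm A, B, 0x01000504.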
12490
12491SDValue SITargetLowering::performOrCombine(SDNode *N,
12492 DAGCombinerInfo &DCI) const {
12493 SelectionDAG &DAG = DCI.DAG;
12494 SDValue LHS = N->getOperand(0);
12495 SDValue RHS = N->getOperand(1);
12496
12497 EVT VT = N->getValueType(0);
12498 if (VT == MVT::i1) {
12499 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12500 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12501 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12502 SDValue Src = LHS.getOperand(0);
12503 if (Src != RHS.getOperand(0))
12504 return SDValue();
12505
12506 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12507 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12508 if (!CLHS || !CRHS)
12509 return SDValue();
12510
12511 // Only 10 bits are used.
12512 static const uint32_t MaxMask = 0x3ff;
12513
12514 uint32_t NewMask =
12515 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12516 SDLoc DL(N);
12517 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12518 DAG.getConstant(NewMask, DL, MVT::i32));
12519 }
12520
12521 return SDValue();
12522 }
12523
12524 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12525 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12526 LHS.getOpcode() == AMDGPUISD::PERM &&
12527 isa<ConstantSDNode>(LHS.getOperand(2))) {
12528 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12529 if (!Sel)
12530 return SDValue();
12531
12532 Sel |= LHS.getConstantOperandVal(2);
12533 SDLoc DL(N);
12534 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12535 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12536 }
12537
12538 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12539 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12540 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12541 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12542
12543 // If all the uses of an or need to extract the individual elements, do not
12544 // attempt to lower into v_perm
12545 auto usesCombinedOperand = [](SDNode *OrUse) {
12546 // If we have any non-vectorized use, then it is a candidate for v_perm
12547 if (OrUse->getOpcode() != ISD::BITCAST ||
12548 !OrUse->getValueType(0).isVector())
12549 return true;
12550
12551 // If any user of the bitcast is non-vectorized, it is still a candidate for v_perm
12552 for (auto *VUser : OrUse->users()) {
12553 if (!VUser->getValueType(0).isVector())
12554 return true;
12555
12556 // If the use of a vector is a store, then combining via a v_perm
12557 // is beneficial.
12558 // TODO -- whitelist more uses
12559 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12560 if (VUser->getOpcode() == VectorwiseOp)
12561 return true;
12562 }
12563 return false;
12564 };
12565
12566 if (!any_of(N->users(), usesCombinedOperand))
12567 return SDValue();
12568
12569 uint32_t LHSMask = getPermuteMask(LHS);
12570 uint32_t RHSMask = getPermuteMask(RHS);
12571
12572 if (LHSMask != ~0u && RHSMask != ~0u) {
12573 // Canonicalize the expression in an attempt to have fewer unique masks
12574 // and therefore fewer registers used to hold the masks.
12575 if (LHSMask > RHSMask) {
12576 std::swap(LHSMask, RHSMask);
12577 std::swap(LHS, RHS);
12578 }
12579
12580 // Select 0xc for each lane used from the source operand. Zero has the 0xc
12581 // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12582 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12583 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12584
12585 // Check if we need to combine values from two sources within a byte.
12586 if (!(LHSUsedLanes & RHSUsedLanes) &&
12587 // If we select high and lower word keep it for SDWA.
12588 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12589 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12590 // Kill zero bytes selected by other mask. Zero value is 0xc.
12591 LHSMask &= ~RHSUsedLanes;
12592 RHSMask &= ~LHSUsedLanes;
12593 // Add 4 to each active LHS lane
12594 LHSMask |= LHSUsedLanes & 0x04040404;
12595 // Combine masks
12596 uint32_t Sel = LHSMask | RHSMask;
12597 SDLoc DL(N);
12598
12599 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12600 RHS.getOperand(0),
12601 DAG.getConstant(Sel, DL, MVT::i32));
12602 }
12603 }
12604 if (LHSMask == ~0u || RHSMask == ~0u) {
12605 if (SDValue Perm = matchPERM(N, DCI))
12606 return Perm;
12607 }
12608 }
12609
12610 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12611 return SDValue();
12612
12613 // TODO: This could be a generic combine with a predicate for extracting the
12614 // high half of an integer being free.
12615
12616 // (or i64:x, (zero_extend i32:y)) ->
12617 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12618 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12619 RHS.getOpcode() != ISD::ZERO_EXTEND)
12620 std::swap(LHS, RHS);
12621
12622 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12623 SDValue ExtSrc = RHS.getOperand(0);
12624 EVT SrcVT = ExtSrc.getValueType();
12625 if (SrcVT == MVT::i32) {
12626 SDLoc SL(N);
12627 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12628 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12629
12630 DCI.AddToWorklist(LowOr.getNode());
12631 DCI.AddToWorklist(HiBits.getNode());
12632
12633 SDValue Vec =
12634 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12635 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12636 }
12637 }
12638
12639 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12640 if (CRHS) {
12641 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12642 N->getOperand(0), CRHS))
12643 return Split;
12644 }
12645
12646 return SDValue();
12647}
12648
12649SDValue SITargetLowering::performXorCombine(SDNode *N,
12650 DAGCombinerInfo &DCI) const {
12651 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12652 return RV;
12653
12654 SDValue LHS = N->getOperand(0);
12655 SDValue RHS = N->getOperand(1);
12656
12657 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12658 SelectionDAG &DAG = DCI.DAG;
12659
12660 EVT VT = N->getValueType(0);
12661 if (CRHS && VT == MVT::i64) {
12662 if (SDValue Split =
12663 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12664 return Split;
12665 }
12666
12667 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12668 // fneg-like xors into 64-bit select.
12669 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12670 // This looks like an fneg, try to fold as a source modifier.
12671 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12672 shouldFoldFNegIntoSrc(N, LHS)) {
12673 // xor (select c, a, b), 0x80000000 ->
12674 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12675 SDLoc DL(N);
12676 SDValue CastLHS =
12677 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12678 SDValue CastRHS =
12679 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12680 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12681 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12682 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12683 LHS->getOperand(0), FNegLHS, FNegRHS);
12684 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12685 }
12686 }
12687
12688 return SDValue();
12689}
12690
12691SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12692 DAGCombinerInfo &DCI) const {
12693 if (!Subtarget->has16BitInsts() ||
12694 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12695 return SDValue();
12696
12697 EVT VT = N->getValueType(0);
12698 if (VT != MVT::i32)
12699 return SDValue();
12700
12701 SDValue Src = N->getOperand(0);
12702 if (Src.getValueType() != MVT::i16)
12703 return SDValue();
12704
12705 return SDValue();
12706}
12707
12708SDValue
12709SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12710 DAGCombinerInfo &DCI) const {
12711 SDValue Src = N->getOperand(0);
12712 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12713
12714 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12715 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12716 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12717 VTSign->getVT() == MVT::i8) ||
12718 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12719 VTSign->getVT() == MVT::i16))) {
12720 assert(Subtarget->hasScalarSubwordLoads() &&
12721 "s_buffer_load_{u8, i8} are supported "
12722 "in GFX12 (or newer) architectures.");
12723 EVT VT = Src.getValueType();
12724 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12725 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12726 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12727 SDLoc DL(N);
12728 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12729 SDValue Ops[] = {
12730 Src.getOperand(0), // source register
12731 Src.getOperand(1), // offset
12732 Src.getOperand(2) // cachePolicy
12733 };
12734 auto *M = cast<MemSDNode>(Src);
12735 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12736 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12737 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12738 return LoadVal;
12739 }
12740 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12741 VTSign->getVT() == MVT::i8) ||
12742 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12743 VTSign->getVT() == MVT::i16)) &&
12744 Src.hasOneUse()) {
12745 auto *M = cast<MemSDNode>(Src);
12746 SDValue Ops[] = {Src.getOperand(0), // Chain
12747 Src.getOperand(1), // rsrc
12748 Src.getOperand(2), // vindex
12749 Src.getOperand(3), // voffset
12750 Src.getOperand(4), // soffset
12751 Src.getOperand(5), // offset
12752 Src.getOperand(6), Src.getOperand(7)};
12753 // replace with BUFFER_LOAD_BYTE/SHORT
12754 SDVTList ResList =
12755 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12756 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12757 ? AMDGPUISD::BUFFER_LOAD_BYTE
12758 : AMDGPUISD::BUFFER_LOAD_SHORT;
12759 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12760 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12761 return DCI.DAG.getMergeValues(
12762 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12763 }
12764 return SDValue();
12765}
12766
12767SDValue SITargetLowering::performClassCombine(SDNode *N,
12768 DAGCombinerInfo &DCI) const {
12769 SelectionDAG &DAG = DCI.DAG;
12770 SDValue Mask = N->getOperand(1);
12771
12772 // fp_class x, 0 -> false
12773 if (isNullConstant(Mask))
12774 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12775
12776 if (N->getOperand(0).isUndef())
12777 return DAG.getUNDEF(MVT::i1);
12778
12779 return SDValue();
12780}
12781
12782SDValue SITargetLowering::performRcpCombine(SDNode *N,
12783 DAGCombinerInfo &DCI) const {
12784 EVT VT = N->getValueType(0);
12785 SDValue N0 = N->getOperand(0);
12786
12787 if (N0.isUndef()) {
12788 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12789 SDLoc(N), VT);
12790 }
12791
12792 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12793 N0.getOpcode() == ISD::SINT_TO_FP)) {
12794 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12795 N->getFlags());
12796 }
12797
12798 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12799 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12800 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12801 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12802 N->getFlags());
12803 }
12804
12804
12805 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12806}
12807
12808bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12809 unsigned MaxDepth) const {
12810 unsigned Opcode = Op.getOpcode();
12811 if (Opcode == ISD::FCANONICALIZE)
12812 return true;
12813
12814 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12815 const auto &F = CFP->getValueAPF();
12816 if (F.isNaN() && F.isSignaling())
12817 return false;
12818 if (!F.isDenormal())
12819 return true;
12820
12821 DenormalMode Mode =
12822 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12823 return Mode == DenormalMode::getIEEE();
12824 }
12825
12826 // If source is a result of another standard FP operation it is already in
12827 // canonical form.
12828 if (MaxDepth == 0)
12829 return false;
12830
12831 switch (Opcode) {
12832 // These will flush denorms if required.
12833 case ISD::FADD:
12834 case ISD::FSUB:
12835 case ISD::FMUL:
12836 case ISD::FCEIL:
12837 case ISD::FFLOOR:
12838 case ISD::FMA:
12839 case ISD::FMAD:
12840 case ISD::FSQRT:
12841 case ISD::FDIV:
12842 case ISD::FREM:
12843 case ISD::FP_ROUND:
12844 case ISD::FP_EXTEND:
12845 case ISD::FP16_TO_FP:
12846 case ISD::FP_TO_FP16:
12847 case ISD::BF16_TO_FP:
12848 case ISD::FP_TO_BF16:
12849 case ISD::FLDEXP:
12852 case AMDGPUISD::RCP:
12853 case AMDGPUISD::RSQ:
12857 case AMDGPUISD::LOG:
12858 case AMDGPUISD::EXP:
12862 case AMDGPUISD::FRACT:
12869 case AMDGPUISD::SIN_HW:
12870 case AMDGPUISD::COS_HW:
12871 return true;
12872
12873 // It can/will be lowered or combined as a bit operation.
12874 // Need to check their input recursively to handle.
12875 case ISD::FNEG:
12876 case ISD::FABS:
12877 case ISD::FCOPYSIGN:
12878 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12879
12880 case ISD::AND:
12881 if (Op.getValueType() == MVT::i32) {
12882 // Be careful, as we only know it is a bitcast floating point type. It
12883 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
12884 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12885 // is valid to optimize for all types.
12886 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12887 if (RHS->getZExtValue() == 0xffff0000) {
12888 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12889 }
12890 }
12891 }
12892 break;
12893
12894 case ISD::FSIN:
12895 case ISD::FCOS:
12896 case ISD::FSINCOS:
12897 return Op.getValueType().getScalarType() != MVT::f16;
12898
12899 case ISD::FMINNUM:
12900 case ISD::FMAXNUM:
12901 case ISD::FMINNUM_IEEE:
12902 case ISD::FMAXNUM_IEEE:
12903 case ISD::FMINIMUM:
12904 case ISD::FMAXIMUM:
12905 case AMDGPUISD::CLAMP:
12906 case AMDGPUISD::FMED3:
12907 case AMDGPUISD::FMAX3:
12908 case AMDGPUISD::FMIN3:
12909 case AMDGPUISD::FMAXIMUM3:
12910 case AMDGPUISD::FMINIMUM3: {
12911 // FIXME: Shouldn't treat the generic operations differently based on these.
12912 // However, we aren't really required to flush the result from
12913 // minnum/maxnum.
12914
12915 // snans will be quieted, so we only need to worry about denormals.
12916 if (Subtarget->supportsMinMaxDenormModes() ||
12917 // FIXME: denormalsEnabledForType is broken for dynamic
12918 denormalsEnabledForType(DAG, Op.getValueType()))
12919 return true;
12920
12921 // Flushing may be required.
12922 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12923 // targets need to check their input recursively.
12924
12925 // FIXME: Does this apply with clamp? It's implemented with max.
12926 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12927 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12928 return false;
12929 }
12930
12931 return true;
12932 }
12933 case ISD::SELECT: {
12934 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12935 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12936 }
12937 case ISD::BUILD_VECTOR: {
12938 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12939 SDValue SrcOp = Op.getOperand(i);
12940 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12941 return false;
12942 }
12943
12944 return true;
12945 }
12948 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12949 }
12951 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12952 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12953 }
12954 case ISD::UNDEF:
12955 // Could be anything.
12956 return false;
12957
12958 case ISD::BITCAST:
12959 // TODO: This is incorrect as it loses track of the operand's type. We may
12960 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12961 // same bits that are canonicalized in one type need not be in the other.
12962 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12963 case ISD::TRUNCATE: {
12964 // Hack around the mess we make when legalizing extract_vector_elt
12965 if (Op.getValueType() == MVT::i16) {
12966 SDValue TruncSrc = Op.getOperand(0);
12967 if (TruncSrc.getValueType() == MVT::i32 &&
12968 TruncSrc.getOpcode() == ISD::BITCAST &&
12969 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12970 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12971 }
12972 }
12973 return false;
12974 }
12975 case ISD::INTRINSIC_WO_CHAIN: {
12976 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12977 // TODO: Handle more intrinsics
12978 switch (IntrinsicID) {
12979 case Intrinsic::amdgcn_cvt_pkrtz:
12980 case Intrinsic::amdgcn_cubeid:
12981 case Intrinsic::amdgcn_frexp_mant:
12982 case Intrinsic::amdgcn_fdot2:
12983 case Intrinsic::amdgcn_rcp:
12984 case Intrinsic::amdgcn_rsq:
12985 case Intrinsic::amdgcn_rsq_clamp:
12986 case Intrinsic::amdgcn_rcp_legacy:
12987 case Intrinsic::amdgcn_rsq_legacy:
12988 case Intrinsic::amdgcn_trig_preop:
12989 case Intrinsic::amdgcn_log:
12990 case Intrinsic::amdgcn_exp2:
12991 case Intrinsic::amdgcn_sqrt:
12992 return true;
12993 default:
12994 break;
12995 }
12996
12997 break;
12998 }
12999 default:
13000 break;
13001 }
13002
13003 // FIXME: denormalsEnabledForType is broken for dynamic
13004 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13005 DAG.isKnownNeverSNaN(Op);
13006}
13007
13009 unsigned MaxDepth) const {
13010 const MachineRegisterInfo &MRI = MF.getRegInfo();
13011 MachineInstr *MI = MRI.getVRegDef(Reg);
13012 unsigned Opcode = MI->getOpcode();
13013
13014 if (Opcode == AMDGPU::G_FCANONICALIZE)
13015 return true;
13016
13017 std::optional<FPValueAndVReg> FCR;
13018 // Constant splat (can be padded with undef) or scalar constant.
13019 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13020 if (FCR->Value.isSignaling())
13021 return false;
13022 if (!FCR->Value.isDenormal())
13023 return true;
13024
13025 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13026 return Mode == DenormalMode::getIEEE();
13027 }
13028
13029 if (MaxDepth == 0)
13030 return false;
13031
13032 switch (Opcode) {
13033 case AMDGPU::G_FADD:
13034 case AMDGPU::G_FSUB:
13035 case AMDGPU::G_FMUL:
13036 case AMDGPU::G_FCEIL:
13037 case AMDGPU::G_FFLOOR:
13038 case AMDGPU::G_FRINT:
13039 case AMDGPU::G_FNEARBYINT:
13040 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13041 case AMDGPU::G_INTRINSIC_TRUNC:
13042 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13043 case AMDGPU::G_FMA:
13044 case AMDGPU::G_FMAD:
13045 case AMDGPU::G_FSQRT:
13046 case AMDGPU::G_FDIV:
13047 case AMDGPU::G_FREM:
13048 case AMDGPU::G_FPOW:
13049 case AMDGPU::G_FPEXT:
13050 case AMDGPU::G_FLOG:
13051 case AMDGPU::G_FLOG2:
13052 case AMDGPU::G_FLOG10:
13053 case AMDGPU::G_FPTRUNC:
13054 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13055 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13056 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13057 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13058 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13059 return true;
13060 case AMDGPU::G_FNEG:
13061 case AMDGPU::G_FABS:
13062 case AMDGPU::G_FCOPYSIGN:
13063 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13064 case AMDGPU::G_FMINNUM:
13065 case AMDGPU::G_FMAXNUM:
13066 case AMDGPU::G_FMINNUM_IEEE:
13067 case AMDGPU::G_FMAXNUM_IEEE:
13068 case AMDGPU::G_FMINIMUM:
13069 case AMDGPU::G_FMAXIMUM: {
13070 if (Subtarget->supportsMinMaxDenormModes() ||
13071 // FIXME: denormalsEnabledForType is broken for dynamic
13072 denormalsEnabledForType(MRI.getType(Reg), MF))
13073 return true;
13074
13075 [[fallthrough]];
13076 }
13077 case AMDGPU::G_BUILD_VECTOR:
13078 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13079 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13080 return false;
13081 return true;
13082 case AMDGPU::G_INTRINSIC:
13083 case AMDGPU::G_INTRINSIC_CONVERGENT:
13084 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13085 case Intrinsic::amdgcn_fmul_legacy:
13086 case Intrinsic::amdgcn_fmad_ftz:
13087 case Intrinsic::amdgcn_sqrt:
13088 case Intrinsic::amdgcn_fmed3:
13089 case Intrinsic::amdgcn_sin:
13090 case Intrinsic::amdgcn_cos:
13091 case Intrinsic::amdgcn_log:
13092 case Intrinsic::amdgcn_exp2:
13093 case Intrinsic::amdgcn_log_clamp:
13094 case Intrinsic::amdgcn_rcp:
13095 case Intrinsic::amdgcn_rcp_legacy:
13096 case Intrinsic::amdgcn_rsq:
13097 case Intrinsic::amdgcn_rsq_clamp:
13098 case Intrinsic::amdgcn_rsq_legacy:
13099 case Intrinsic::amdgcn_div_scale:
13100 case Intrinsic::amdgcn_div_fmas:
13101 case Intrinsic::amdgcn_div_fixup:
13102 case Intrinsic::amdgcn_fract:
13103 case Intrinsic::amdgcn_cvt_pkrtz:
13104 case Intrinsic::amdgcn_cubeid:
13105 case Intrinsic::amdgcn_cubema:
13106 case Intrinsic::amdgcn_cubesc:
13107 case Intrinsic::amdgcn_cubetc:
13108 case Intrinsic::amdgcn_frexp_mant:
13109 case Intrinsic::amdgcn_fdot2:
13110 case Intrinsic::amdgcn_trig_preop:
13111 return true;
13112 default:
13113 break;
13114 }
13115
13116 [[fallthrough]];
13117 default:
13118 return false;
13119 }
13120
13121 llvm_unreachable("invalid operation");
13122}
13123
13124// Constant fold canonicalize.
13125SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13126 const SDLoc &SL, EVT VT,
13127 const APFloat &C) const {
13128 // Flush denormals to 0 if not enabled.
13129 if (C.isDenormal()) {
13130 DenormalMode Mode =
13131 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13132 if (Mode == DenormalMode::getPreserveSign()) {
13133 return DAG.getConstantFP(
13134 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13135 }
13136
13137 if (Mode != DenormalMode::getIEEE())
13138 return SDValue();
13139 }
13140
13141 if (C.isNaN()) {
13142 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13143 if (C.isSignaling()) {
13144 // Quiet a signaling NaN.
13145 // FIXME: Is this supposed to preserve payload bits?
13146 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13147 }
13148
13149 // Make sure it is the canonical NaN bitpattern.
13150 //
13151 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13152 // immediate?
13153 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13154 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13155 }
13156
13157 // Already canonical.
13158 return DAG.getConstantFP(C, SL, VT);
13159}
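// For example, with f32 denormals set to preserve-sign, an input constant with
// bit pattern 0x00000001 (the smallest positive denormal) folds to +0.0, and
// an f32 signaling NaN such as 0x7fa00000 folds to the canonical quiet NaN
// 0x7fc00000.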
13160
13162 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13163}
13164
13165SDValue
13166SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13167 DAGCombinerInfo &DCI) const {
13168 SelectionDAG &DAG = DCI.DAG;
13169 SDValue N0 = N->getOperand(0);
13170 EVT VT = N->getValueType(0);
13171
13172 // fcanonicalize undef -> qnan
13173 if (N0.isUndef()) {
13174 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13175 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13176 }
13177
13178 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13179 EVT VT = N->getValueType(0);
13180 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13181 }
13182
13183 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13184 // (fcanonicalize k)
13185 //
13186 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13187
13188 // TODO: This could be better with wider vectors that will be split to v2f16,
13189 // and to consider uses since there aren't that many packed operations.
13190 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13191 isTypeLegal(MVT::v2f16)) {
13192 SDLoc SL(N);
13193 SDValue NewElts[2];
13194 SDValue Lo = N0.getOperand(0);
13195 SDValue Hi = N0.getOperand(1);
13196 EVT EltVT = Lo.getValueType();
13197
13199 for (unsigned I = 0; I != 2; ++I) {
13200 SDValue Op = N0.getOperand(I);
13201 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13202 NewElts[I] =
13203 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13204 } else if (Op.isUndef()) {
13205 // Handled below based on what the other operand is.
13206 NewElts[I] = Op;
13207 } else {
13208 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13209 }
13210 }
13211
13212 // If one half is undef, and one is constant, prefer a splat vector rather
13213 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13214 // cheaper to use and may be free with a packed operation.
13215 if (NewElts[0].isUndef()) {
13216 if (isa<ConstantFPSDNode>(NewElts[1]))
13217 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13218 ? NewElts[1]
13219 : DAG.getConstantFP(0.0f, SL, EltVT);
13220 }
13221
13222 if (NewElts[1].isUndef()) {
13223 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13224 ? NewElts[0]
13225 : DAG.getConstantFP(0.0f, SL, EltVT);
13226 }
13227
13228 return DAG.getBuildVector(VT, SL, NewElts);
13229 }
13230 }
13231
13232 return SDValue();
13233}
13234
13235static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13236 switch (Opc) {
13237 case ISD::FMAXNUM:
13238 case ISD::FMAXNUM_IEEE:
13239 return AMDGPUISD::FMAX3;
13240 case ISD::FMAXIMUM:
13241 return AMDGPUISD::FMAXIMUM3;
13242 case ISD::SMAX:
13243 return AMDGPUISD::SMAX3;
13244 case ISD::UMAX:
13245 return AMDGPUISD::UMAX3;
13246 case ISD::FMINNUM:
13247 case ISD::FMINNUM_IEEE:
13248 return AMDGPUISD::FMIN3;
13249 case ISD::FMINIMUM:
13250 return AMDGPUISD::FMINIMUM3;
13251 case ISD::SMIN:
13252 return AMDGPUISD::SMIN3;
13253 case ISD::UMIN:
13254 return AMDGPUISD::UMIN3;
13255 default:
13256 llvm_unreachable("Not a min/max opcode");
13257 }
13258}
13259
13260SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13261 const SDLoc &SL, SDValue Src,
13262 SDValue MinVal,
13263 SDValue MaxVal,
13264 bool Signed) const {
13265
13266 // med3 comes from
13267 // min(max(x, K0), K1), K0 < K1
13268 // max(min(x, K0), K1), K1 < K0
13269 //
13270 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13271 // min/max op.
13272 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13273 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13274
13275 if (!MinK || !MaxK)
13276 return SDValue();
13277
13278 if (Signed) {
13279 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13280 return SDValue();
13281 } else {
13282 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13283 return SDValue();
13284 }
13285
13286 EVT VT = MinK->getValueType(0);
13287 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13288 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13289 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13290
13291 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13292 // not available, but this is unlikely to be profitable as constants
13293 // will often need to be materialized & extended, especially on
13294 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13295 return SDValue();
13296}
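// For example, smin(smax(x, -4), 15) would reach here with MaxVal = -4 and
// MinVal = 15; since -4 < 15 the clamp becomes med3(x, -4, 15), i.e. a single
// v_med3_i32.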
13297
13299 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13300 return C;
13301
13302 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13303 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13304 return C;
13305 }
13306
13307 return nullptr;
13308}
13309
13310SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13311 const SDLoc &SL, SDValue Op0,
13312 SDValue Op1) const {
13313 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13314 if (!K1)
13315 return SDValue();
13316
13317 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13318 if (!K0)
13319 return SDValue();
13320
13321 // Ordered >= (although NaN inputs should have folded away by now).
13322 if (K0->getValueAPF() > K1->getValueAPF())
13323 return SDValue();
13324
13325 const MachineFunction &MF = DAG.getMachineFunction();
13326 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13327
13328 // TODO: Check IEEE bit enabled?
13329 EVT VT = Op0.getValueType();
13330 if (Info->getMode().DX10Clamp) {
13331 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13332 // hardware fmed3 behavior converting to a min.
13333 // FIXME: Should this be allowing -0.0?
13334 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13335 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13336 }
13337
13338 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13339 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13340 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13341 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13342 // then give the other result, which is different from med3 with a NaN
13343 // input.
13344 SDValue Var = Op0.getOperand(0);
13345 if (!DAG.isKnownNeverSNaN(Var))
13346 return SDValue();
13347
13348 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13349
13350 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13351 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13352 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13353 SDValue(K0, 0), SDValue(K1, 0));
13354 }
13355 }
13356
13357 return SDValue();
13358}
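The K0 == 0.0 / K1 == 1.0 special case above degenerates into a clamp. A minimal scalar model of the dx10_clamp behaviour it assumes (illustrative only, invented helper name, ignoring signed-zero and signaling-NaN subtleties):

#include <algorithm>
#include <cmath>

// fmed3(x, 0.0f, 1.0f) acts as a plain clamp for ordinary inputs; with
// dx10_clamp enabled, a NaN input additionally clamps to 0.0f.
static float clampDX10(float x) {
  if (std::isnan(x))
    return 0.0f;
  return std::min(std::max(x, 0.0f), 1.0f);
}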
13359
13360/// \return true if the subtarget supports minimum3 and maximum3 with the given
13361/// base min/max opcode \p Opc for type \p VT.
13362static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13363 EVT VT) {
13364 switch (Opc) {
13365 case ISD::FMINNUM:
13366 case ISD::FMAXNUM:
13367 case ISD::FMINNUM_IEEE:
13368 case ISD::FMAXNUM_IEEE:
13371 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13372 case ISD::FMINIMUM:
13373 case ISD::FMAXIMUM:
13374 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13375 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13376 case ISD::SMAX:
13377 case ISD::SMIN:
13378 case ISD::UMAX:
13379 case ISD::UMIN:
13380 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13381 default:
13382 return false;
13383 }
13384
13385 llvm_unreachable("not a min/max opcode");
13386}
13387
13388SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13389 DAGCombinerInfo &DCI) const {
13390 SelectionDAG &DAG = DCI.DAG;
13391
13392 EVT VT = N->getValueType(0);
13393 unsigned Opc = N->getOpcode();
13394 SDValue Op0 = N->getOperand(0);
13395 SDValue Op1 = N->getOperand(1);
13396
13397 // Only do this if the inner op has one use since this will just increase
13398 // register pressure for no benefit.
13399
13400 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13401 // max(max(a, b), c) -> max3(a, b, c)
13402 // min(min(a, b), c) -> min3(a, b, c)
13403 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13404 SDLoc DL(N);
13405 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13406 Op0.getOperand(0), Op0.getOperand(1), Op1);
13407 }
13408
13409 // Try commuted.
13410 // max(a, max(b, c)) -> max3(a, b, c)
13411 // min(a, min(b, c)) -> min3(a, b, c)
13412 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13413 SDLoc DL(N);
13414 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13415 Op0, Op1.getOperand(0), Op1.getOperand(1));
13416 }
13417 }
13418
13419 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13420 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13421 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13422 if (SDValue Med3 = performIntMed3ImmCombine(
13423 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13424 return Med3;
13425 }
13426 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13427 if (SDValue Med3 = performIntMed3ImmCombine(
13428 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13429 return Med3;
13430 }
13431
13432 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13433 if (SDValue Med3 = performIntMed3ImmCombine(
13434 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13435 return Med3;
13436 }
13437 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13438 if (SDValue Med3 = performIntMed3ImmCombine(
13439 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13440 return Med3;
13441 }
13442
13443 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13444 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13445 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13446 (Opc == AMDGPUISD::FMIN_LEGACY &&
13447 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13448 (VT == MVT::f32 || VT == MVT::f64 ||
13449 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13450 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13451 Op0.hasOneUse()) {
13452 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13453 return Res;
13454 }
13455
13456 return SDValue();
13457}
13458
13459 static bool isClampZeroToOne(SDValue A, SDValue B) {
13460 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13461 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13462 // FIXME: Should this be allowing -0.0?
13463 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13464 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13465 }
13466 }
13467
13468 return false;
13469}
13470
13471// FIXME: Should only worry about snans for version with chain.
13472SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13473 DAGCombinerInfo &DCI) const {
13474 EVT VT = N->getValueType(0);
13475 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13476 // NaNs. With a NaN input, the order of the operands may change the result.
13477
13478 SelectionDAG &DAG = DCI.DAG;
13479 SDLoc SL(N);
13480
13481 SDValue Src0 = N->getOperand(0);
13482 SDValue Src1 = N->getOperand(1);
13483 SDValue Src2 = N->getOperand(2);
13484
13485 if (isClampZeroToOne(Src0, Src1)) {
13486 // const_a, const_b, x -> clamp is safe in all cases including signaling
13487 // nans.
13488 // FIXME: Should this be allowing -0.0?
13489 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13490 }
13491
13492 const MachineFunction &MF = DAG.getMachineFunction();
13493 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13494
13495 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13496 // handling no dx10-clamp?
13497 if (Info->getMode().DX10Clamp) {
13498 // If NaNs are clamped to 0, we are free to reorder the inputs.
13499
13500 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13501 std::swap(Src0, Src1);
13502
13503 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13504 std::swap(Src1, Src2);
13505
13506 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13507 std::swap(Src0, Src1);
13508
13509 if (isClampZeroToOne(Src1, Src2))
13510 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13511 }
13512
13513 return SDValue();
13514}
13515
13516SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13517 DAGCombinerInfo &DCI) const {
13518 SDValue Src0 = N->getOperand(0);
13519 SDValue Src1 = N->getOperand(1);
13520 if (Src0.isUndef() && Src1.isUndef())
13521 return DCI.DAG.getUNDEF(N->getValueType(0));
13522 return SDValue();
13523}
13524
13525// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13526// expanded into a set of cmp/select instructions.
13527 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13528 unsigned NumElem,
13529 bool IsDivergentIdx,
13530 const GCNSubtarget *Subtarget) {
13531 if (UseDivergentRegisterIndexing)
13532 return false;
13533
13534 unsigned VecSize = EltSize * NumElem;
13535
13536 // Sub-dword vectors whose total size is 2 dwords or less have a better implementation.
13537 if (VecSize <= 64 && EltSize < 32)
13538 return false;
13539
13540 // Always expand the rest of sub-dword instructions, otherwise it will be
13541 // lowered via memory.
13542 if (EltSize < 32)
13543 return true;
13544
13545 // Always do this if var-idx is divergent, otherwise it will become a loop.
13546 if (IsDivergentIdx)
13547 return true;
13548
13549 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13550 unsigned NumInsts = NumElem /* Number of compares */ +
13551 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13552
13553 // On some architectures (GFX9) movrel is not available and it's better
13554 // to expand.
13555 if (Subtarget->useVGPRIndexMode())
13556 return NumInsts <= 16;
13557
13558 // If movrel is available, use it instead of expanding for vector of 8
13559 // elements.
13560 if (Subtarget->hasMovrel())
13561 return NumInsts <= 15;
13562
13563 return true;
13564}
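As a worked instance of the cost heuristic above (illustrative arithmetic only, helper name invented): a dynamic extract from v8i32 needs 8 compares plus 8 v_cndmask_b32, while v8i64 needs 8 compares plus 16 cndmasks.

// Mirror of the NumInsts estimate above, usable for quick checks.
constexpr unsigned dynExtCost(unsigned EltSize, unsigned NumElem) {
  return NumElem /* compares */ + ((EltSize + 31) / 32) * NumElem /* cndmasks */;
}

static_assert(dynExtCost(32, 8) == 16, "v8i32: 8 compares + 8 cndmasks");
static_assert(dynExtCost(64, 8) == 24, "v8i64: 8 compares + 16 cndmasks");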
13565
13566 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13567 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13568 if (isa<ConstantSDNode>(Idx))
13569 return false;
13570
13571 SDValue Vec = N->getOperand(0);
13572 EVT VecVT = Vec.getValueType();
13573 EVT EltVT = VecVT.getVectorElementType();
13574 unsigned EltSize = EltVT.getSizeInBits();
13575 unsigned NumElem = VecVT.getVectorNumElements();
13576
13577 return SITargetLowering::shouldExpandVectorDynExt(
13578 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13579}
13580
13581SDValue
13582SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13583 DAGCombinerInfo &DCI) const {
13584 SDValue Vec = N->getOperand(0);
13585 SelectionDAG &DAG = DCI.DAG;
13586
13587 EVT VecVT = Vec.getValueType();
13588 EVT VecEltVT = VecVT.getVectorElementType();
13589 EVT ResVT = N->getValueType(0);
13590
13591 unsigned VecSize = VecVT.getSizeInBits();
13592 unsigned VecEltSize = VecEltVT.getSizeInBits();
13593
13594 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13595 allUsesHaveSourceMods(N)) {
13596 SDLoc SL(N);
13597 SDValue Idx = N->getOperand(1);
13598 SDValue Elt =
13599 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13600 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13601 }
13602
13603 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13604 // =>
13605 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13606 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13607 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13608 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13609 SDLoc SL(N);
13610 SDValue Idx = N->getOperand(1);
13611 unsigned Opc = Vec.getOpcode();
13612
13613 switch (Opc) {
13614 default:
13615 break;
13616 // TODO: Support other binary operations.
13617 case ISD::FADD:
13618 case ISD::FSUB:
13619 case ISD::FMUL:
13620 case ISD::ADD:
13621 case ISD::UMIN:
13622 case ISD::UMAX:
13623 case ISD::SMIN:
13624 case ISD::SMAX:
13625 case ISD::FMAXNUM:
13626 case ISD::FMINNUM:
13627 case ISD::FMAXNUM_IEEE:
13628 case ISD::FMINNUM_IEEE:
13629 case ISD::FMAXIMUM:
13630 case ISD::FMINIMUM: {
13631 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13632 Vec.getOperand(0), Idx);
13633 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13634 Vec.getOperand(1), Idx);
13635
13636 DCI.AddToWorklist(Elt0.getNode());
13637 DCI.AddToWorklist(Elt1.getNode());
13638 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13639 }
13640 }
13641 }
13642
13643 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13644 if (shouldExpandVectorDynExt(N)) {
13645 SDLoc SL(N);
13646 SDValue Idx = N->getOperand(1);
13647 SDValue V;
13648 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13649 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13650 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13651 if (I == 0)
13652 V = Elt;
13653 else
13654 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13655 }
13656 return V;
13657 }
13658
13659 if (!DCI.isBeforeLegalize())
13660 return SDValue();
13661
13662 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13663 // elements. This exposes more load reduction opportunities by replacing
13664 // multiple small extract_vector_elements with a single 32-bit extract.
13665 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13666 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13667 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13668 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13669
13670 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13671 unsigned EltIdx = BitIndex / 32;
13672 unsigned LeftoverBitIdx = BitIndex % 32;
13673 SDLoc SL(N);
13674
13675 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13676 DCI.AddToWorklist(Cast.getNode());
13677
13678 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13679 DAG.getConstant(EltIdx, SL, MVT::i32));
13680 DCI.AddToWorklist(Elt.getNode());
13681 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13682 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13683 DCI.AddToWorklist(Srl.getNode());
13684
13685 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13686 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13687 DCI.AddToWorklist(Trunc.getNode());
13688
13689 if (VecEltVT == ResVT) {
13690 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13691 }
13692
13693 assert(ResVT.isScalarInteger());
13694 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13695 }
13696
13697 return SDValue();
13698}
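To make the index arithmetic of the sub-dword rewrite concrete, here is a small illustrative helper (not part of this file, names invented): element 5 of a v8i8 lives in 32-bit word 1 at bit offset 8, so the combine extracts dword 1 and shifts right by 8 before truncating.

// Which 32-bit word holds sub-dword element I, and at which bit it starts.
constexpr unsigned dwordIndex(unsigned EltBits, unsigned I) {
  return (I * EltBits) / 32;
}
constexpr unsigned bitWithinDword(unsigned EltBits, unsigned I) {
  return (I * EltBits) % 32;
}

static_assert(dwordIndex(8, 5) == 1 && bitWithinDword(8, 5) == 8,
              "element 5 of v8i8 is byte 1 of dword 1");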
13699
13700SDValue
13701SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13702 DAGCombinerInfo &DCI) const {
13703 SDValue Vec = N->getOperand(0);
13704 SDValue Idx = N->getOperand(2);
13705 EVT VecVT = Vec.getValueType();
13706 EVT EltVT = VecVT.getVectorElementType();
13707
13708 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13709 // => BUILD_VECTOR n x select (e, const-idx)
13710 if (!shouldExpandVectorDynExt(N))
13711 return SDValue();
13712
13713 SelectionDAG &DAG = DCI.DAG;
13714 SDLoc SL(N);
13715 SDValue Ins = N->getOperand(1);
13716 EVT IdxVT = Idx.getValueType();
13717
13718 SmallVector<SDValue, 16> Ops;
13719 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13720 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13721 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13722 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13723 Ops.push_back(V);
13724 }
13725
13726 return DAG.getBuildVector(VecVT, SL, Ops);
13727}
13728
13729/// Return the source of an fp_extend from f16 to f32, or a converted FP
13730/// constant.
13731 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13732 if (Src.getOpcode() == ISD::FP_EXTEND &&
13733 Src.getOperand(0).getValueType() == MVT::f16) {
13734 return Src.getOperand(0);
13735 }
13736
13737 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13738 APFloat Val = CFP->getValueAPF();
13739 bool LosesInfo = true;
13740 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13741 if (!LosesInfo)
13742 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13743 }
13744
13745 return SDValue();
13746}
13747
13748SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13749 DAGCombinerInfo &DCI) const {
13750 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13751 "combine only useful on gfx8");
13752
13753 SDValue TruncSrc = N->getOperand(0);
13754 EVT VT = N->getValueType(0);
13755 if (VT != MVT::f16)
13756 return SDValue();
13757
13758 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13759 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13760 return SDValue();
13761
13762 SelectionDAG &DAG = DCI.DAG;
13763 SDLoc SL(N);
13764
13765 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13766 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13767 // casting back.
13768
13769 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13770 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13771 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13772 if (!A)
13773 return SDValue();
13774
13775 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13776 if (!B)
13777 return SDValue();
13778
13779 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13780 if (!C)
13781 return SDValue();
13782
13783 // This changes signaling nan behavior. If an input is a signaling nan, it
13784 // would have been quieted by the fpext originally. We don't care because
13785 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13786 // we would be worse off than just doing the promotion.
13787 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13788 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13789 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13790 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13791}
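The min/max expansion above computes the median of the three f16 inputs. A standalone check of that identity for ordinary (non-NaN) values, illustrative only and with an invented helper name:

#include <algorithm>
#include <cassert>

// fmin(fmax(a, b), fmax(fmin(a, b), c)) selects the median of {a, b, c},
// which matches fmed3 for non-NaN inputs.
static void checkMedianExpansion(float a, float b, float c) {
  float Med = std::min(std::max(a, b), std::max(std::min(a, b), c));
  float V[3] = {a, b, c};
  std::sort(V, V + 3);
  assert(Med == V[1]);
}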
13792
13793unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13794 const SDNode *N0,
13795 const SDNode *N1) const {
13796 EVT VT = N0->getValueType(0);
13797
13798 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13799 // support denormals ever.
13800 if (((VT == MVT::f32 &&
13801 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13802 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13803 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13804 isOperationLegal(ISD::FMAD, VT))
13805 return ISD::FMAD;
13806
13807 const TargetOptions &Options = DAG.getTarget().Options;
13808 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13809 (N0->getFlags().hasAllowContract() &&
13810 N1->getFlags().hasAllowContract())) &&
13811 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13812 return ISD::FMA;
13813 }
13814
13815 return 0;
13816}
13817
13818// For a reassociatable opcode perform:
13819// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13820SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13821 SelectionDAG &DAG) const {
13822 EVT VT = N->getValueType(0);
13823 if (VT != MVT::i32 && VT != MVT::i64)
13824 return SDValue();
13825
13826 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13827 return SDValue();
13828
13829 unsigned Opc = N->getOpcode();
13830 SDValue Op0 = N->getOperand(0);
13831 SDValue Op1 = N->getOperand(1);
13832
13833 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13834 return SDValue();
13835
13836 if (Op0->isDivergent())
13837 std::swap(Op0, Op1);
13838
13839 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13840 return SDValue();
13841
13842 SDValue Op2 = Op1.getOperand(1);
13843 Op1 = Op1.getOperand(0);
13844 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13845 return SDValue();
13846
13847 if (Op1->isDivergent())
13848 std::swap(Op1, Op2);
13849
13850 SDLoc SL(N);
13851 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13852 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13853}
13854
13855static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13856 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13857 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13858 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13859 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13860 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13861}
13862
13863// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13864// multiplies, if any.
13865//
13866// Full 64-bit multiplies that feed into an addition are lowered here instead
13867// of using the generic expansion. The generic expansion ends up with
13868// a tree of ADD nodes that prevents us from using the "add" part of the
13869// MAD instruction. The expansion produced here results in a chain of ADDs
13870// instead of a tree.
13871SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13872 DAGCombinerInfo &DCI) const {
13873 assert(N->getOpcode() == ISD::ADD);
13874
13875 SelectionDAG &DAG = DCI.DAG;
13876 EVT VT = N->getValueType(0);
13877 SDLoc SL(N);
13878 SDValue LHS = N->getOperand(0);
13879 SDValue RHS = N->getOperand(1);
13880
13881 if (VT.isVector())
13882 return SDValue();
13883
13884 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13885 // result in scalar registers for uniform values.
13886 if (!N->isDivergent() && Subtarget->hasSMulHi())
13887 return SDValue();
13888
13889 unsigned NumBits = VT.getScalarSizeInBits();
13890 if (NumBits <= 32 || NumBits > 64)
13891 return SDValue();
13892
13893 if (LHS.getOpcode() != ISD::MUL) {
13894 assert(RHS.getOpcode() == ISD::MUL);
13895 std::swap(LHS, RHS);
13896 }
13897
13898 // Avoid the fold if it would unduly increase the number of multiplies due to
13899 // multiple uses, except on hardware with full-rate multiply-add (which is
13900 // part of full-rate 64-bit ops).
13901 if (!Subtarget->hasFullRate64Ops()) {
13902 unsigned NumUsers = 0;
13903 for (SDNode *User : LHS->users()) {
13904 // There is a use that does not feed into addition, so the multiply can't
13905 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13906 if (User->getOpcode() != ISD::ADD)
13907 return SDValue();
13908
13909 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13910 // MUL + 3xADD + 3xADDC over 3xMAD.
13911 ++NumUsers;
13912 if (NumUsers >= 3)
13913 return SDValue();
13914 }
13915 }
13916
13917 SDValue MulLHS = LHS.getOperand(0);
13918 SDValue MulRHS = LHS.getOperand(1);
13919 SDValue AddRHS = RHS;
13920
13921 // Always check whether operands are small unsigned values, since that
13922 // knowledge is useful in more cases. Check for small signed values only if
13923 // doing so can unlock a shorter code sequence.
13924 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13925 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13926
13927 bool MulSignedLo = false;
13928 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13929 MulSignedLo =
13930 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13931 }
13932
13933 // The operands and final result all have the same number of bits. If
13934 // operands need to be extended, they can be extended with garbage. The
13935 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13936 // truncated away in the end.
13937 if (VT != MVT::i64) {
13938 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13939 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13940 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13941 }
13942
13943 // The basic code generated is conceptually straightforward. Pseudo code:
13944 //
13945 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13946 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13947 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13948 //
13949 // The second and third lines are optional, depending on whether the factors
13950 // are {sign,zero}-extended or not.
13951 //
13952 // The actual DAG is noisier than the pseudo code, but only due to
13953 // instructions that disassemble values into low and high parts, and
13954 // assemble the final result.
13955 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13956
13957 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13958 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13959 SDValue Accum =
13960 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13961
13962 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13963 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13964
13965 if (!MulLHSUnsigned32) {
13966 auto MulLHSHi =
13967 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13968 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13969 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13970 }
13971
13972 if (!MulRHSUnsigned32) {
13973 auto MulRHSHi =
13974 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13975 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13976 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13977 }
13978
13979 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13980 Accum = DAG.getBitcast(MVT::i64, Accum);
13981 }
13982
13983 if (VT != MVT::i64)
13984 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13985 return Accum;
13986}
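The pseudo code in the comment above can be validated with plain 64-bit arithmetic. The following sketch is illustrative only (invented helper names) and mirrors the low/high decomposition the fold emits:

#include <cassert>
#include <cstdint>

// accum = mad_64_32(lhs.lo, rhs.lo, accum); the two cross products are then
// added into accum.hi. The lhs.hi * rhs.hi term would land above bit 63 and
// is dropped, so the result matches a plain 64-bit multiply-add.
static uint64_t mad64_32Expansion(uint64_t LHS, uint64_t RHS, uint64_t Add) {
  uint32_t LHSLo = (uint32_t)LHS, LHSHi = (uint32_t)(LHS >> 32);
  uint32_t RHSLo = (uint32_t)RHS, RHSHi = (uint32_t)(RHS >> 32);
  uint64_t Accum = (uint64_t)LHSLo * RHSLo + Add;  // mad_u64_u32
  uint32_t AccumHi = (uint32_t)(Accum >> 32);
  AccumHi += LHSHi * RHSLo;                        // add (mul lhs.hi, rhs.lo)
  AccumHi += LHSLo * RHSHi;                        // add (mul lhs.lo, rhs.hi)
  return ((uint64_t)AccumHi << 32) | (uint32_t)Accum;
}

static void checkMad64(uint64_t L, uint64_t R, uint64_t A) {
  assert(mad64_32Expansion(L, R, A) == L * R + A); // both wrap modulo 2^64
}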
13987
13988// Collect the ultimate src of each of the mul node's operands, and confirm
13989 // each operand is effectively no wider than 8 bits.
13990static std::optional<ByteProvider<SDValue>>
13991handleMulOperand(const SDValue &MulOperand) {
13992 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13993 if (!Byte0 || Byte0->isConstantZero()) {
13994 return std::nullopt;
13995 }
13996 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13997 if (Byte1 && !Byte1->isConstantZero()) {
13998 return std::nullopt;
13999 }
14000 return Byte0;
14001}
14002
14003static unsigned addPermMasks(unsigned First, unsigned Second) {
14004 unsigned FirstCs = First & 0x0c0c0c0c;
14005 unsigned SecondCs = Second & 0x0c0c0c0c;
14006 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14007 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14008
14009 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14010 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14011 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14012 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14013
14014 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14015}
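The masks above follow the v_perm_b32 selector convention: each selector byte picks one byte of the {src0, src1} pair, and 0x0c yields a constant zero byte, which is why 0x0c marks unused lanes and addPermMasks can merge masks byte-wise. A minimal emulation covering just the selector values used here (illustrative only; invented helper name; other v_perm encodings are not modelled):

#include <cstdint>

// Selector bytes 0-3 pick bytes of Src1, 4-7 pick bytes of Src0, and the
// 0x0c encoding used above yields 0. Other v_perm selectors are ignored.
static uint32_t permB32(uint32_t Sel, uint32_t Src0, uint32_t Src1) {
  uint64_t Pair = ((uint64_t)Src0 << 32) | Src1;
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xFF;
    uint8_t Byte = S <= 7 ? (uint8_t)(Pair >> (8 * S)) : 0;
    Result |= (uint32_t)Byte << (8 * I);
  }
  return Result;
}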
14016
14017struct DotSrc {
14018 SDValue SrcOp;
14019 int64_t PermMask;
14020 int64_t DWordOffset;
14021 };
14022
14023 static void placeSources(ByteProvider<SDValue> &Src0,
14024 ByteProvider<SDValue> &Src1,
14025 SmallVectorImpl<DotSrc> &Src0s,
14026 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14027
14028 assert(Src0.Src.has_value() && Src1.Src.has_value());
14029 // Src0s and Src1s are empty, just place arbitrarily.
14030 if (Step == 0) {
14031 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14032 Src0.SrcOffset / 4});
14033 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14034 Src1.SrcOffset / 4});
14035 return;
14036 }
14037
14038 for (int BPI = 0; BPI < 2; BPI++) {
14039 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14040 if (BPI == 1) {
14041 BPP = {Src1, Src0};
14042 }
14043 unsigned ZeroMask = 0x0c0c0c0c;
14044 unsigned FMask = 0xFF << (8 * (3 - Step));
14045
14046 unsigned FirstMask =
14047 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14048 unsigned SecondMask =
14049 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14050 // Attempt to find the Src vector which contains our SDValue; if found, add
14051 // our perm mask to the existing one. If we are unable to find a match for
14052 // the first SDValue, attempt to find a match for the second.
14053 int FirstGroup = -1;
14054 for (int I = 0; I < 2; I++) {
14055 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14056 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14057 return IterElt.SrcOp == *BPP.first.Src &&
14058 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14059 };
14060
14061 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14062 if (Match != Srcs.end()) {
14063 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14064 FirstGroup = I;
14065 break;
14066 }
14067 }
14068 if (FirstGroup != -1) {
14069 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14070 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14071 return IterElt.SrcOp == *BPP.second.Src &&
14072 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14073 };
14074 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14075 if (Match != Srcs.end()) {
14076 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14077 } else
14078 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14079 return;
14080 }
14081 }
14082
14083 // If we have made it here, then we could not find a match in Src0s or Src1s
14084 // for either Src0 or Src1, so just place them arbitrarily.
14085
14086 unsigned ZeroMask = 0x0c0c0c0c;
14087 unsigned FMask = 0xFF << (8 * (3 - Step));
14088
14089 Src0s.push_back(
14090 {*Src0.Src,
14091 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14092 Src0.SrcOffset / 4});
14093 Src1s.push_back(
14094 {*Src1.Src,
14095 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14096 Src1.SrcOffset / 4});
14097}
14098
14099 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14100 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14101 bool IsAny) {
14102
14103 // If we just have one source, just permute it accordingly.
14104 if (Srcs.size() == 1) {
14105 auto *Elt = Srcs.begin();
14106 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14107
14108 // v_perm will produce the original value
14109 if (Elt->PermMask == 0x3020100)
14110 return EltOp;
14111
14112 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14113 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14114 }
14115
14116 auto *FirstElt = Srcs.begin();
14117 auto *SecondElt = std::next(FirstElt);
14118
14119 SmallVector<SDValue, 2> Perms;
14120
14121 // If we have multiple sources in the chain, combine them via perms (using
14122 // calculated perm mask) and Ors.
14123 while (true) {
14124 auto FirstMask = FirstElt->PermMask;
14125 auto SecondMask = SecondElt->PermMask;
14126
14127 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14128 unsigned FirstPlusFour = FirstMask | 0x04040404;
14129 // ORing in 0x04 bumps selectors 0-3 up to 4-7 (the first perm operand) and
14130 // leaves any original 0x0C zero selector unchanged (0x0c | 0x04 == 0x0c).
14131 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14132
14133 auto PermMask = addPermMasks(FirstMask, SecondMask);
14134 auto FirstVal =
14135 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14136 auto SecondVal =
14137 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14138
14139 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14140 SecondVal,
14141 DAG.getConstant(PermMask, SL, MVT::i32)));
14142
14143 FirstElt = std::next(SecondElt);
14144 if (FirstElt == Srcs.end())
14145 break;
14146
14147 SecondElt = std::next(FirstElt);
14148 // If we only have a FirstElt, then just combine that into the cumulative
14149 // source node.
14150 if (SecondElt == Srcs.end()) {
14151 auto EltOp =
14152 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14153
14154 Perms.push_back(
14155 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14156 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14157 break;
14158 }
14159 }
14160
14161 assert(Perms.size() == 1 || Perms.size() == 2);
14162 return Perms.size() == 2
14163 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14164 : Perms[0];
14165}
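The selector adjustment at the top of the loop above can also be viewed in isolation: selectors 0-3 of the first mask are bumped to 4-7 so they address the first perm operand, while 0x0c zero selectors are preserved. An illustrative standalone equivalent (invented helper name):

#include <cstdint>

// Same computation as FirstCs / FirstPlusFour above: move selectors 0-3 to
// 4-7 and keep every 0x0c "zero" byte as 0x0c.
constexpr uint32_t bumpFirstOperandSelectors(uint32_t Mask) {
  uint32_t Cs = Mask & 0x0c0c0c0c;
  return ((Mask | 0x04040404) & 0x0F0F0F0F) | Cs;
}

static_assert(bumpFirstOperandSelectors(0x0c0c0002) == 0x0c0c0406,
              "selectors 0x00/0x02 become 0x04/0x06; 0x0c stays 0x0c");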
14166
14167static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14168 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14169 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14170 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14171 EntryMask += ZeroMask;
14172 }
14173}
14174
14175static bool isMul(const SDValue Op) {
14176 auto Opcode = Op.getOpcode();
14177
14178 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14179 Opcode == AMDGPUISD::MUL_I24);
14180}
14181
14182static std::optional<bool>
14183 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14184 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14185 const SDValue &S1Op, const SelectionDAG &DAG) {
14186 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14187 // of the dot4 are irrelevant.
14188 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14189 return false;
14190
14191 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14192 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14193 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14194 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14195 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14196 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14197
14198 assert(!(S0IsUnsigned && S0IsSigned));
14199 assert(!(S1IsUnsigned && S1IsSigned));
14200
14201 // There are 9 possible permutations of
14202 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14203
14204 // In two permutations, the sign bits are known to be the same for both Ops,
14205 // so simply return Signed / Unsigned corresponding to the MSB
14206
14207 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14208 return S0IsSigned;
14209
14210 // In another two permutations, the sign bits are known to be opposite. In
14211 // this case return std::nullopt to indicate a bad match.
14212
14213 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14214 return std::nullopt;
14215
14216 // In the remaining five permutations, we don't know the value of the sign
14217 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14218 // the upper bits must be extension bits. Thus, the only way for the sign
14219 // bit to be unknown is if it was sign extended from an unknown value, or if
14220 // it was any extended. In either case, it is correct to use the signed
14221 // version of the signedness semantics of dot4
14222
14223 // In two of these permutations, we know the sign bit is set for one op,
14224 // and the other is unknown. It is okay to use the signed version of
14225 // dot4.
14226 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14227 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14228 return true;
14229
14230 // In one such permutation, we don't know either of the sign bits. It is okay
14231 // to use the signed version of dot4.
14232 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14233 return true;
14234
14235 // In two of these permutations, we know the sign bit is unset for
14236 // one op, and the other is unknown. Return std::nullopt to indicate a
14237 // bad match.
14238 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14239 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14240 return std::nullopt;
14241
14242 llvm_unreachable("Fully covered condition");
14243}
14244
14245SDValue SITargetLowering::performAddCombine(SDNode *N,
14246 DAGCombinerInfo &DCI) const {
14247 SelectionDAG &DAG = DCI.DAG;
14248 EVT VT = N->getValueType(0);
14249 SDLoc SL(N);
14250 SDValue LHS = N->getOperand(0);
14251 SDValue RHS = N->getOperand(1);
14252
14253 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14254 if (Subtarget->hasMad64_32()) {
14255 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14256 return Folded;
14257 }
14258 }
14259
14260 if (SDValue V = reassociateScalarOps(N, DAG)) {
14261 return V;
14262 }
14263
14264 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14265 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14266 SDValue TempNode(N, 0);
14267 std::optional<bool> IsSigned;
14268 SmallVector<DotSrc, 4> Src0s;
14269 SmallVector<DotSrc, 4> Src1s;
14270 SmallVector<SDValue, 4> Src2s;
14271
14272 // Match the v_dot4 tree, while collecting src nodes.
14273 int ChainLength = 0;
14274 for (int I = 0; I < 4; I++) {
14275 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14276 if (MulIdx == -1)
14277 break;
14278 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14279 if (!Src0)
14280 break;
14281 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14282 if (!Src1)
14283 break;
14284
14285 auto IterIsSigned = checkDot4MulSignedness(
14286 TempNode->getOperand(MulIdx), *Src0, *Src1,
14287 TempNode->getOperand(MulIdx)->getOperand(0),
14288 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14289 if (!IterIsSigned)
14290 break;
14291 if (!IsSigned)
14292 IsSigned = *IterIsSigned;
14293 if (*IterIsSigned != *IsSigned)
14294 break;
14295 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14296 auto AddIdx = 1 - MulIdx;
14297 // Allow the special case where add (add (mul24, 0), mul24) has become
14298 // add (mul24, mul24).
14299 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14300 Src2s.push_back(TempNode->getOperand(AddIdx));
14301 auto Src0 =
14302 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14303 if (!Src0)
14304 break;
14305 auto Src1 =
14306 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14307 if (!Src1)
14308 break;
14309 auto IterIsSigned = checkDot4MulSignedness(
14310 TempNode->getOperand(AddIdx), *Src0, *Src1,
14311 TempNode->getOperand(AddIdx)->getOperand(0),
14312 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14313 if (!IterIsSigned)
14314 break;
14315 assert(IsSigned);
14316 if (*IterIsSigned != *IsSigned)
14317 break;
14318 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14319 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14320 ChainLength = I + 2;
14321 break;
14322 }
14323
14324 TempNode = TempNode->getOperand(AddIdx);
14325 Src2s.push_back(TempNode);
14326 ChainLength = I + 1;
14327 if (TempNode->getNumOperands() < 2)
14328 break;
14329 LHS = TempNode->getOperand(0);
14330 RHS = TempNode->getOperand(1);
14331 }
14332
14333 if (ChainLength < 2)
14334 return SDValue();
14335
14336 // Masks were constructed with the assumption that we would find a chain of
14337 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14338 // 0x0c) so they do not affect dot calculation.
14339 if (ChainLength < 4) {
14340 fixMasks(Src0s, ChainLength);
14341 fixMasks(Src1s, ChainLength);
14342 }
14343
14344 SDValue Src0, Src1;
14345
14346 // If we are just using a single source for both, and have permuted the
14347 // bytes consistently, we can just use the sources without permuting
14348 // (commutation).
14349 bool UseOriginalSrc = false;
14350 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14351 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14352 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14353 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14354 SmallVector<unsigned, 4> SrcBytes;
14355 auto Src0Mask = Src0s.begin()->PermMask;
14356 SrcBytes.push_back(Src0Mask & 0xFF000000);
14357 bool UniqueEntries = true;
14358 for (auto I = 1; I < 4; I++) {
14359 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14360
14361 if (is_contained(SrcBytes, NextByte)) {
14362 UniqueEntries = false;
14363 break;
14364 }
14365 SrcBytes.push_back(NextByte);
14366 }
14367
14368 if (UniqueEntries) {
14369 UseOriginalSrc = true;
14370
14371 auto *FirstElt = Src0s.begin();
14372 auto FirstEltOp =
14373 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14374
14375 auto *SecondElt = Src1s.begin();
14376 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14377 SecondElt->DWordOffset);
14378
14379 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14380 MVT::getIntegerVT(32));
14381 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14382 MVT::getIntegerVT(32));
14383 }
14384 }
14385
14386 if (!UseOriginalSrc) {
14387 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14388 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14389 }
14390
14391 assert(IsSigned);
14392 SDValue Src2 =
14393 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14394
14395 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14396 : Intrinsic::amdgcn_udot4,
14397 SL, MVT::i64);
14398
14399 assert(!VT.isVector());
14400 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14401 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14402
14403 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14404 }
14405
14406 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14407 return SDValue();
14408
14409 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14410 // add x, sext (setcc) => usubo_carry x, 0, setcc
14411 unsigned Opc = LHS.getOpcode();
14412 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14413 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14414 std::swap(RHS, LHS);
14415
14416 Opc = RHS.getOpcode();
14417 switch (Opc) {
14418 default:
14419 break;
14420 case ISD::ZERO_EXTEND:
14421 case ISD::SIGN_EXTEND:
14422 case ISD::ANY_EXTEND: {
14423 auto Cond = RHS.getOperand(0);
14424 // If this won't be a real VOPC output, we would still need to insert an
14425 // extra instruction anyway.
14426 if (!isBoolSGPR(Cond))
14427 break;
14428 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14429 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14430 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14431 return DAG.getNode(Opc, SL, VTList, Args);
14432 }
14433 case ISD::UADDO_CARRY: {
14434 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14435 if (!isNullConstant(RHS.getOperand(1)))
14436 break;
14437 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14438 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14439 }
14440 }
14441 return SDValue();
14442}
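For reference, a scalar model of the dot4 semantics this combine targets (illustrative only, invented helper name): udot4 multiplies the four byte lanes of the two i32 sources and accumulates them into the third operand, and sdot4 does the same with each byte sign-extended.

#include <cstdint>

// Reference model of llvm.amdgcn.udot4: per-byte products plus accumulator.
static uint32_t udot4Reference(uint32_t A, uint32_t B, uint32_t C) {
  uint32_t Sum = C;
  for (int I = 0; I < 4; ++I)
    Sum += ((A >> (8 * I)) & 0xFF) * ((B >> (8 * I)) & 0xFF);
  return Sum;
}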
14443
14444SDValue SITargetLowering::performSubCombine(SDNode *N,
14445 DAGCombinerInfo &DCI) const {
14446 SelectionDAG &DAG = DCI.DAG;
14447 EVT VT = N->getValueType(0);
14448
14449 if (VT != MVT::i32)
14450 return SDValue();
14451
14452 SDLoc SL(N);
14453 SDValue LHS = N->getOperand(0);
14454 SDValue RHS = N->getOperand(1);
14455
14456 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14457 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14458 unsigned Opc = RHS.getOpcode();
14459 switch (Opc) {
14460 default:
14461 break;
14462 case ISD::ZERO_EXTEND:
14463 case ISD::SIGN_EXTEND:
14464 case ISD::ANY_EXTEND: {
14465 auto Cond = RHS.getOperand(0);
14466 // If this won't be a real VOPC output, we would still need to insert an
14467 // extra instruction anyway.
14468 if (!isBoolSGPR(Cond))
14469 break;
14470 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14471 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14472 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14473 return DAG.getNode(Opc, SL, VTList, Args);
14474 }
14475 }
14476
14477 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14478 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14479 if (!isNullConstant(LHS.getOperand(1)))
14480 return SDValue();
14481 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14482 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14483 }
14484 return SDValue();
14485}
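The zext/sext rewrites in the two combines above rest on a simple identity (illustrative scalar model, not part of this file): an i1 zero-extended to i32 contributes exactly a carry/borrow of 0 or 1, and a sign-extended i1 is the negative of that, which is why the sext forms swap which carry opcode is used.

#include <cstdint>

// add x, (zext cc)  ==  uaddo_carry x, 0, cc   (x + 0 + cc)
// sub x, (zext cc)  ==  usubo_carry x, 0, cc   (x - 0 - cc)
static uint32_t addWithCarryIn(uint32_t X, bool CC) { return X + 0u + (CC ? 1u : 0u); }
static uint32_t subWithBorrowIn(uint32_t X, bool CC) { return X - 0u - (CC ? 1u : 0u); }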
14486
14487SDValue
14488SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14489 DAGCombinerInfo &DCI) const {
14490
14491 if (N->getValueType(0) != MVT::i32)
14492 return SDValue();
14493
14494 if (!isNullConstant(N->getOperand(1)))
14495 return SDValue();
14496
14497 SelectionDAG &DAG = DCI.DAG;
14498 SDValue LHS = N->getOperand(0);
14499
14500 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14501 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14502 unsigned LHSOpc = LHS.getOpcode();
14503 unsigned Opc = N->getOpcode();
14504 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14505 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14506 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14507 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14508 }
14509 return SDValue();
14510}
14511
14512SDValue SITargetLowering::performFAddCombine(SDNode *N,
14513 DAGCombinerInfo &DCI) const {
14514 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14515 return SDValue();
14516
14517 SelectionDAG &DAG = DCI.DAG;
14518 EVT VT = N->getValueType(0);
14519
14520 SDLoc SL(N);
14521 SDValue LHS = N->getOperand(0);
14522 SDValue RHS = N->getOperand(1);
14523
14524 // These should really be instruction patterns, but writing patterns with
14525 // source modifiers is a pain.
14526
14527 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14528 if (LHS.getOpcode() == ISD::FADD) {
14529 SDValue A = LHS.getOperand(0);
14530 if (A == LHS.getOperand(1)) {
14531 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14532 if (FusedOp != 0) {
14533 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14534 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14535 }
14536 }
14537 }
14538
14539 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14540 if (RHS.getOpcode() == ISD::FADD) {
14541 SDValue A = RHS.getOperand(0);
14542 if (A == RHS.getOperand(1)) {
14543 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14544 if (FusedOp != 0) {
14545 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14546 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14547 }
14548 }
14549 }
14550
14551 return SDValue();
14552}
14553
14554SDValue SITargetLowering::performFSubCombine(SDNode *N,
14555 DAGCombinerInfo &DCI) const {
14556 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14557 return SDValue();
14558
14559 SelectionDAG &DAG = DCI.DAG;
14560 SDLoc SL(N);
14561 EVT VT = N->getValueType(0);
14562 assert(!VT.isVector());
14563
14564 // Try to get the fneg to fold into the source modifier. This undoes generic
14565 // DAG combines and folds them into the mad.
14566 //
14567 // Only do this if we are not trying to support denormals. v_mad_f32 does
14568 // not support denormals ever.
14569 SDValue LHS = N->getOperand(0);
14570 SDValue RHS = N->getOperand(1);
14571 if (LHS.getOpcode() == ISD::FADD) {
14572 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14573 SDValue A = LHS.getOperand(0);
14574 if (A == LHS.getOperand(1)) {
14575 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14576 if (FusedOp != 0) {
14577 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14578 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14579
14580 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14581 }
14582 }
14583 }
14584
14585 if (RHS.getOpcode() == ISD::FADD) {
14586 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14587
14588 SDValue A = RHS.getOperand(0);
14589 if (A == RHS.getOperand(1)) {
14590 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14591 if (FusedOp != 0) {
14592 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14593 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14594 }
14595 }
14596 }
14597
14598 return SDValue();
14599}
14600
14601SDValue SITargetLowering::performFDivCombine(SDNode *N,
14602 DAGCombinerInfo &DCI) const {
14603 SelectionDAG &DAG = DCI.DAG;
14604 SDLoc SL(N);
14605 EVT VT = N->getValueType(0);
14606 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14607 return SDValue();
14608
14609 SDValue LHS = N->getOperand(0);
14610 SDValue RHS = N->getOperand(1);
14611
14612 SDNodeFlags Flags = N->getFlags();
14613 SDNodeFlags RHSFlags = RHS->getFlags();
14614 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14615 !RHS->hasOneUse())
14616 return SDValue();
14617
14618 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14619 bool IsNegative = false;
14620 if (CLHS->isExactlyValue(1.0) ||
14621 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14622 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14623 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14624 if (RHS.getOpcode() == ISD::FSQRT) {
14625 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14626 SDValue Rsq =
14627 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14628 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14629 }
14630 }
14631 }
14632
14633 return SDValue();
14634}
14635
14636SDValue SITargetLowering::performFMulCombine(SDNode *N,
14637 DAGCombinerInfo &DCI) const {
14638 SelectionDAG &DAG = DCI.DAG;
14639 EVT VT = N->getValueType(0);
14640 EVT ScalarVT = VT.getScalarType();
14641 EVT IntVT = VT.changeElementType(MVT::i32);
14642
14643 SDValue LHS = N->getOperand(0);
14644 SDValue RHS = N->getOperand(1);
14645
14646 // It is cheaper to realize i32 inline constants than to materialize
14647 // f16 or f64 (or even non-inline f32) values; this is possible via
14648 // ldexp usage, as shown below:
14649 //
14650 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14651 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14652 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
14653 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14654 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14655 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14656 if (!TrueNode)
14657 return SDValue();
14658 const ConstantFPSDNode *FalseNode =
14659 isConstOrConstSplatFP(RHS.getOperand(2));
14660 if (!FalseNode)
14661 return SDValue();
14662
14663 if (TrueNode->isNegative() != FalseNode->isNegative())
14664 return SDValue();
14665
14666 // For f32, only non-inline constants should be transformed.
14668 if (ScalarVT == MVT::f32 &&
14669 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14670 TII->isInlineConstant(FalseNode->getValueAPF()))
14671 return SDValue();
14672
14673 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14674 if (TrueNodeExpVal == INT_MIN)
14675 return SDValue();
14676 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14677 if (FalseNodeExpVal == INT_MIN)
14678 return SDValue();
14679
14680 SDLoc SL(N);
14681 SDValue SelectNode =
14682 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14683 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14684 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14685
14686 LHS = TrueNode->isNegative()
14687 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14688 : LHS;
14689
14690 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14691 }
14692
14693 return SDValue();
14694}
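A worked instance of the ldexp rewrite above (illustrative only, invented helper name, assuming a finite non-NaN multiplicand): with A = 8.0 = 2^3 and B = 0.5 = 2^-1, the select of constants becomes a select of their exponents.

#include <cassert>
#include <cmath>

// fmul x, (select y, 8.0, 0.5)  ->  ldexp(x, (select y, 3, -1))
static void checkLdexpRewrite(float X, bool Y) {
  float Expected = X * (Y ? 8.0f : 0.5f);
  float Rewritten = std::ldexp(X, Y ? 3 : -1);
  assert(Rewritten == Expected); // exact: scaling by a power of two
}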
14695
14696SDValue SITargetLowering::performFMACombine(SDNode *N,
14697 DAGCombinerInfo &DCI) const {
14698 SelectionDAG &DAG = DCI.DAG;
14699 EVT VT = N->getValueType(0);
14700 SDLoc SL(N);
14701
14702 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14703 return SDValue();
14704
14705 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14706 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14707 SDValue Op1 = N->getOperand(0);
14708 SDValue Op2 = N->getOperand(1);
14709 SDValue FMA = N->getOperand(2);
14710
14711 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14712 Op2.getOpcode() != ISD::FP_EXTEND)
14713 return SDValue();
14714
14715 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14716 // regardless of the denorm mode setting. Therefore,
14717 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14718 const TargetOptions &Options = DAG.getTarget().Options;
14719 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14720 (N->getFlags().hasAllowContract() &&
14721 FMA->getFlags().hasAllowContract())) {
14722 Op1 = Op1.getOperand(0);
14723 Op2 = Op2.getOperand(0);
14724 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14725 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14726 return SDValue();
14727
14728 SDValue Vec1 = Op1.getOperand(0);
14729 SDValue Idx1 = Op1.getOperand(1);
14730 SDValue Vec2 = Op2.getOperand(0);
14731
14732 SDValue FMAOp1 = FMA.getOperand(0);
14733 SDValue FMAOp2 = FMA.getOperand(1);
14734 SDValue FMAAcc = FMA.getOperand(2);
14735
14736 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14737 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14738 return SDValue();
14739
14740 FMAOp1 = FMAOp1.getOperand(0);
14741 FMAOp2 = FMAOp2.getOperand(0);
14742 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14743 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14744 return SDValue();
14745
14746 SDValue Vec3 = FMAOp1.getOperand(0);
14747 SDValue Vec4 = FMAOp2.getOperand(0);
14748 SDValue Idx2 = FMAOp1.getOperand(1);
14749
14750 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14751 // Idx1 and Idx2 cannot be the same.
14752 Idx1 == Idx2)
14753 return SDValue();
14754
14755 if (Vec1 == Vec2 || Vec3 == Vec4)
14756 return SDValue();
14757
14758 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14759 return SDValue();
14760
14761 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14762 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14763 DAG.getTargetConstant(0, SL, MVT::i1));
14764 }
14765 }
14766 return SDValue();
14767}
14768
14769SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14770 DAGCombinerInfo &DCI) const {
14771 SelectionDAG &DAG = DCI.DAG;
14772 SDLoc SL(N);
14773
14774 SDValue LHS = N->getOperand(0);
14775 SDValue RHS = N->getOperand(1);
14776 EVT VT = LHS.getValueType();
14777 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14778
14779 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14780 if (!CRHS) {
14781 CRHS = dyn_cast<ConstantSDNode>(LHS);
14782 if (CRHS) {
14783 std::swap(LHS, RHS);
14784 CC = ISD::getSetCCSwappedOperands(CC);
14785 }
14786 }
14787
14788 if (CRHS) {
14789 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14790 isBoolSGPR(LHS.getOperand(0))) {
14791 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14792 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14793 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14794 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14795 if ((CRHS->isAllOnes() &&
14796 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14797 (CRHS->isZero() &&
14798 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14799 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14800 DAG.getAllOnesConstant(SL, MVT::i1));
14801 if ((CRHS->isAllOnes() &&
14802 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14803 (CRHS->isZero() &&
14804 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14805 return LHS.getOperand(0);
14806 }
14807
14808 const APInt &CRHSVal = CRHS->getAPIntValue();
14809 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14810 LHS.getOpcode() == ISD::SELECT &&
14811 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14812 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14813 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14814 isBoolSGPR(LHS.getOperand(0))) {
14815 // Given CT != FT:
14816 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14817 // setcc (select cc, CT, CF), CF, ne => cc
14818 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14819 // setcc (select cc, CT, CF), CT, eq => cc
14820 const APInt &CT = LHS.getConstantOperandAPInt(1);
14821 const APInt &CF = LHS.getConstantOperandAPInt(2);
14822
14823 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14824 (CT == CRHSVal && CC == ISD::SETNE))
14825 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14826 DAG.getAllOnesConstant(SL, MVT::i1));
14827 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14828 (CT == CRHSVal && CC == ISD::SETEQ))
14829 return LHS.getOperand(0);
14830 }
14831 }
14832
14833 if (VT != MVT::f32 && VT != MVT::f64 &&
14834 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14835 return SDValue();
14836
14837 // Match isinf/isfinite pattern
14838 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14839 // (fcmp one (fabs x), inf) -> (fp_class x,
14840 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14841 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14842 LHS.getOpcode() == ISD::FABS) {
14843 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14844 if (!CRHS)
14845 return SDValue();
14846
14847 const APFloat &APF = CRHS->getValueAPF();
14848 if (APF.isInfinity() && !APF.isNegative()) {
14849 const unsigned IsInfMask =
14850 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14851 const unsigned IsFiniteMask =
14852 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14853 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14854 SIInstrFlags::P_SUBNORMAL;
14855 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14856 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14857 DAG.getConstant(Mask, SL, MVT::i32));
14858 }
14859 }
14860
14861 return SDValue();
14862}
14863
14864SDValue
14865SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14866 DAGCombinerInfo &DCI) const {
14867 SelectionDAG &DAG = DCI.DAG;
14868 SDLoc SL(N);
14869 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14870
14871 SDValue Src = N->getOperand(0);
14872 SDValue Shift = N->getOperand(0);
14873
14874 // TODO: Extend type shouldn't matter (assuming legal types).
14875 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14876 Shift = Shift.getOperand(0);
14877
14878 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14879 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14880 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14881 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14882 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14883 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14884 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14885 SDValue Shifted = DAG.getZExtOrTrunc(
14886 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14887
14888 unsigned ShiftOffset = 8 * Offset;
14889 if (Shift.getOpcode() == ISD::SHL)
14890 ShiftOffset -= C->getZExtValue();
14891 else
14892 ShiftOffset += C->getZExtValue();
14893
14894 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14895 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14896 MVT::f32, Shifted);
14897 }
14898 }
14899 }
14900
14901 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14902 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14903 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14904 // We simplified Src. If this node is not dead, visit it again so it is
14905 // folded properly.
14906 if (N->getOpcode() != ISD::DELETED_NODE)
14907 DCI.AddToWorklist(N);
14908 return SDValue(N, 0);
14909 }
14910
14911 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14912 if (SDValue DemandedSrc =
14913 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14914 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14915
14916 return SDValue();
14917}
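The shift folding above is pure byte-index bookkeeping. An illustrative standalone check of the adjustments listed in the comment (invented helper name):

// Selecting byte N of (x srl S) equals byte N + S/8 of x; for (x shl S) it
// is byte N - S/8, provided the adjusted index stays within 0-3.
constexpr int foldedByteIndex(int N, int ShiftBits, bool IsShl) {
  return IsShl ? N - ShiftBits / 8 : N + ShiftBits / 8;
}

static_assert(foldedByteIndex(0, 16, /*IsShl=*/false) == 2,
              "cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x");
static_assert(foldedByteIndex(1, 8, /*IsShl=*/true) == 0,
              "cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x");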
14918
14919SDValue SITargetLowering::performClampCombine(SDNode *N,
14920 DAGCombinerInfo &DCI) const {
14921 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14922 if (!CSrc)
14923 return SDValue();
14924
14925 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14926 const APFloat &F = CSrc->getValueAPF();
14927 APFloat Zero = APFloat::getZero(F.getSemantics());
14928 if (F < Zero ||
14929 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14930 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14931 }
14932
14933 APFloat One(F.getSemantics(), "1.0");
14934 if (F > One)
14935 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14936
14937 return SDValue(CSrc, 0);
14938}
14939
14940 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14941 DAGCombinerInfo &DCI) const {
14942 switch (N->getOpcode()) {
14943 case ISD::ADD:
14944 case ISD::SUB:
14945 case ISD::SHL:
14946 case ISD::SRL:
14947 case ISD::SRA:
14948 case ISD::AND:
14949 case ISD::OR:
14950 case ISD::XOR:
14951 case ISD::MUL:
14952 case ISD::SETCC:
14953 case ISD::SELECT:
14954 case ISD::SMIN:
14955 case ISD::SMAX:
14956 case ISD::UMIN:
14957 case ISD::UMAX:
14958 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14959 return Res;
14960 break;
14961 default:
14962 break;
14963 }
14964
14965 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14966 return SDValue();
14967
14968 switch (N->getOpcode()) {
14969 case ISD::ADD:
14970 return performAddCombine(N, DCI);
14971 case ISD::SUB:
14972 return performSubCombine(N, DCI);
14973 case ISD::UADDO_CARRY:
14974 case ISD::USUBO_CARRY:
14975 return performAddCarrySubCarryCombine(N, DCI);
14976 case ISD::FADD:
14977 return performFAddCombine(N, DCI);
14978 case ISD::FSUB:
14979 return performFSubCombine(N, DCI);
14980 case ISD::FDIV:
14981 return performFDivCombine(N, DCI);
14982 case ISD::FMUL:
14983 return performFMulCombine(N, DCI);
14984 case ISD::SETCC:
14985 return performSetCCCombine(N, DCI);
14986 case ISD::FMAXNUM:
14987 case ISD::FMINNUM:
14988 case ISD::FMAXNUM_IEEE:
14989 case ISD::FMINNUM_IEEE:
14990 case ISD::FMAXIMUM:
14991 case ISD::FMINIMUM:
14992 case ISD::SMAX:
14993 case ISD::SMIN:
14994 case ISD::UMAX:
14995 case ISD::UMIN:
14996 case AMDGPUISD::FMIN_LEGACY:
14997 case AMDGPUISD::FMAX_LEGACY:
14998 return performMinMaxCombine(N, DCI);
14999 case ISD::FMA:
15000 return performFMACombine(N, DCI);
15001 case ISD::AND:
15002 return performAndCombine(N, DCI);
15003 case ISD::OR:
15004 return performOrCombine(N, DCI);
15005 case ISD::FSHR: {
15006 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15007 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15008 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15009 return matchPERM(N, DCI);
15010 }
15011 break;
15012 }
15013 case ISD::XOR:
15014 return performXorCombine(N, DCI);
15015 case ISD::ZERO_EXTEND:
15016 return performZeroExtendCombine(N, DCI);
15017 case ISD::SIGN_EXTEND_INREG:
15018 return performSignExtendInRegCombine(N, DCI);
15019 case AMDGPUISD::FP_CLASS:
15020 return performClassCombine(N, DCI);
15021 case ISD::FCANONICALIZE:
15022 return performFCanonicalizeCombine(N, DCI);
15023 case AMDGPUISD::RCP:
15024 return performRcpCombine(N, DCI);
15025 case ISD::FLDEXP:
15026 case AMDGPUISD::FRACT:
15027 case AMDGPUISD::RSQ:
15028 case AMDGPUISD::RCP_LEGACY:
15029 case AMDGPUISD::RCP_IFLAG:
15030 case AMDGPUISD::RSQ_CLAMP: {
15031 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15032 SDValue Src = N->getOperand(0);
15033 if (Src.isUndef())
15034 return Src;
15035 break;
15036 }
15037 case ISD::SINT_TO_FP:
15038 case ISD::UINT_TO_FP:
15039 return performUCharToFloatCombine(N, DCI);
15040 case ISD::FCOPYSIGN:
15041 return performFCopySignCombine(N, DCI);
15042 case AMDGPUISD::CVT_F32_UBYTE0:
15043 case AMDGPUISD::CVT_F32_UBYTE1:
15044 case AMDGPUISD::CVT_F32_UBYTE2:
15045 case AMDGPUISD::CVT_F32_UBYTE3:
15046 return performCvtF32UByteNCombine(N, DCI);
15047 case AMDGPUISD::FMED3:
15048 return performFMed3Combine(N, DCI);
15049 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15050 return performCvtPkRTZCombine(N, DCI);
15051 case AMDGPUISD::CLAMP:
15052 return performClampCombine(N, DCI);
15053 case ISD::SCALAR_TO_VECTOR: {
15054 SelectionDAG &DAG = DCI.DAG;
15055 EVT VT = N->getValueType(0);
15056
15057 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15058 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15059 SDLoc SL(N);
15060 SDValue Src = N->getOperand(0);
15061 EVT EltVT = Src.getValueType();
15062 if (EltVT != MVT::i16)
15063 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15064
15065 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15066 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15067 }
15068
15069 break;
15070 }
15071 case ISD::EXTRACT_VECTOR_ELT:
15072 return performExtractVectorEltCombine(N, DCI);
15073 case ISD::INSERT_VECTOR_ELT:
15074 return performInsertVectorEltCombine(N, DCI);
15075 case ISD::FP_ROUND:
15076 return performFPRoundCombine(N, DCI);
15077 case ISD::LOAD: {
15078 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15079 return Widened;
15080 [[fallthrough]];
15081 }
15082 default: {
15083 if (!DCI.isBeforeLegalize()) {
15084 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15085 return performMemSDNodeCombine(MemNode, DCI);
15086 }
15087
15088 break;
15089 }
15090 }
15091
15092 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15093}
15094
15095/// Helper function for adjustWritemask
15096static unsigned SubIdx2Lane(unsigned Idx) {
15097 switch (Idx) {
15098 default:
15099 return ~0u;
15100 case AMDGPU::sub0:
15101 return 0;
15102 case AMDGPU::sub1:
15103 return 1;
15104 case AMDGPU::sub2:
15105 return 2;
15106 case AMDGPU::sub3:
15107 return 3;
15108 case AMDGPU::sub4:
15109 return 4; // Possible with TFE/LWE
15110 }
15111}
15112
15113/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15114SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15115 SelectionDAG &DAG) const {
15116 unsigned Opcode = Node->getMachineOpcode();
15117
15118 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15119 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15120 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15121 return Node; // not implemented for D16
15122
15123 SDNode *Users[5] = {nullptr};
15124 unsigned Lane = 0;
15125 unsigned DmaskIdx =
15126 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15127 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15128 unsigned NewDmask = 0;
15129 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15130 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15131 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15132 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15133 ? true
15134 : false;
15135 unsigned TFCLane = 0;
15136 bool HasChain = Node->getNumValues() > 1;
15137
15138 if (OldDmask == 0) {
15139 // These are folded out, but on the chance it happens don't assert.
15140 return Node;
15141 }
15142
15143 unsigned OldBitsSet = llvm::popcount(OldDmask);
15144 // Work out which is the TFE/LWE lane if that is enabled.
15145 if (UsesTFC) {
15146 TFCLane = OldBitsSet;
15147 }
15148
15149 // Try to figure out the used register components
15150 for (SDUse &Use : Node->uses()) {
15151
15152 // Don't look at users of the chain.
15153 if (Use.getResNo() != 0)
15154 continue;
15155
15156 SDNode *User = Use.getUser();
15157
15158 // Abort if we can't understand the usage
15159 if (!User->isMachineOpcode() ||
15160 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15161 return Node;
15162
15163 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15164 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15165 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15166 // set, etc.
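// For example, with OldDmask == 0b1010 only the Y and W components are
// returned, so Lane==0 (sub0) corresponds to Y and Lane==1 (sub1) to W.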
15167 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15168 if (Lane == ~0u)
15169 return Node;
15170
15171 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15172 if (UsesTFC && Lane == TFCLane) {
15173 Users[Lane] = User;
15174 } else {
15175 // Set which texture component corresponds to the lane.
15176 unsigned Comp;
15177 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15178 Comp = llvm::countr_zero(Dmask);
15179 Dmask &= ~(1 << Comp);
15180 }
15181
15182 // Abort if we have more than one user per component.
15183 if (Users[Lane])
15184 return Node;
15185
15186 Users[Lane] = User;
15187 NewDmask |= 1 << Comp;
15188 }
15189 }
15190
15191 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15192 bool NoChannels = !NewDmask;
15193 if (NoChannels) {
15194 if (!UsesTFC) {
15195 // No uses of the result and not using TFC. Then do nothing.
15196 return Node;
15197 }
15198 // If the original dmask has one channel - then nothing to do
15199 if (OldBitsSet == 1)
15200 return Node;
15201 // Use an arbitrary dmask - required for the instruction to work
15202 NewDmask = 1;
15203 }
15204 // Abort if there's no change
15205 if (NewDmask == OldDmask)
15206 return Node;
15207
15208 unsigned BitsSet = llvm::popcount(NewDmask);
15209
15210 // Check for TFE or LWE - increase the number of channels by one to account
15211 // for the extra return value
15212 // This will need adjustment for D16 if this is also included in
15213 // adjustWriteMask (this function) but at present D16 are excluded.
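// For example, two bits set in NewDmask plus an enabled TFE/LWE gives
// NewChannels == 3 below.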
15214 unsigned NewChannels = BitsSet + UsesTFC;
15215
15216 int NewOpcode =
15217 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15218 assert(NewOpcode != -1 &&
15219 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15220 "failed to find equivalent MIMG op");
15221
15222 // Adjust the writemask in the node
15223 SmallVector<SDValue, 12> Ops;
15224 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15225 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15226 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15227
15228 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15229
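// Note (descriptive only): a single remaining channel keeps the scalar type,
// while 3 and 5 channels are widened to 4- and 8-element vectors respectively
// before the node is rebuilt below.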
15230 MVT ResultVT = NewChannels == 1
15231 ? SVT
15232 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15233 : NewChannels == 5 ? 8
15234 : NewChannels);
15235 SDVTList NewVTList =
15236 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15237
15238 MachineSDNode *NewNode =
15239 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15240
15241 if (HasChain) {
15242 // Update chain.
15243 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15244 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15245 }
15246
15247 if (NewChannels == 1) {
15248 assert(Node->hasNUsesOfValue(1, 0));
15249 SDNode *Copy =
15250 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15251 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15252 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15253 return nullptr;
15254 }
15255
15256 // Update the users of the node with the new indices
15257 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15258 SDNode *User = Users[i];
15259 if (!User) {
15260 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15261 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15262 if (i || !NoChannels)
15263 continue;
15264 } else {
15265 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15266 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15267 if (NewUser != User) {
15268 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15269 DAG.RemoveDeadNode(User);
15270 }
15271 }
15272
15273 switch (Idx) {
15274 default:
15275 break;
15276 case AMDGPU::sub0:
15277 Idx = AMDGPU::sub1;
15278 break;
15279 case AMDGPU::sub1:
15280 Idx = AMDGPU::sub2;
15281 break;
15282 case AMDGPU::sub2:
15283 Idx = AMDGPU::sub3;
15284 break;
15285 case AMDGPU::sub3:
15286 Idx = AMDGPU::sub4;
15287 break;
15288 }
15289 }
15290
15291 DAG.RemoveDeadNode(Node);
15292 return nullptr;
15293}
15294
15295 static bool isFrameIndexOp(SDValue Op) {
15296 if (Op.getOpcode() == ISD::AssertZext)
15297 Op = Op.getOperand(0);
15298
15299 return isa<FrameIndexSDNode>(Op);
15300}
15301
15302/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15303/// with frame index operands.
15304 /// LLVM assumes that inputs to these instructions are registers.
15305 SDNode *
15306 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15307 SelectionDAG &DAG) const {
15308 if (Node->getOpcode() == ISD::CopyToReg) {
15309 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15310 SDValue SrcVal = Node->getOperand(2);
15311
15312 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15313 // to try understanding copies to physical registers.
15314 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15315 SDLoc SL(Node);
15316 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15317 SDValue VReg = DAG.getRegister(
15318 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15319
15320 SDNode *Glued = Node->getGluedNode();
15321 SDValue ToVReg = DAG.getCopyToReg(
15322 Node->getOperand(0), SL, VReg, SrcVal,
15323 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15324 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15325 VReg, ToVReg.getValue(1));
15326 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15327 DAG.RemoveDeadNode(Node);
15328 return ToResultReg.getNode();
15329 }
15330 }
15331
15332 SmallVector<SDValue, 8> Ops;
15333 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15334 if (!isFrameIndexOp(Node->getOperand(i))) {
15335 Ops.push_back(Node->getOperand(i));
15336 continue;
15337 }
15338
15339 SDLoc DL(Node);
15340 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15341 Node->getOperand(i).getValueType(),
15342 Node->getOperand(i)),
15343 0));
15344 }
15345
15346 return DAG.UpdateNodeOperands(Node, Ops);
15347}
15348
15349/// Fold the instructions after selecting them.
15350/// Returns null if users were already updated.
15351 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15352 SelectionDAG &DAG) const {
15353 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15354 unsigned Opcode = Node->getMachineOpcode();
15355
15356 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15357 !TII->isGather4(Opcode) &&
15358 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15359 return adjustWritemask(Node, DAG);
15360 }
15361
15362 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15363 legalizeTargetIndependentNode(Node, DAG);
15364 return Node;
15365 }
15366
15367 switch (Opcode) {
15368 case AMDGPU::V_DIV_SCALE_F32_e64:
15369 case AMDGPU::V_DIV_SCALE_F64_e64: {
15370 // Satisfy the operand register constraint when one of the inputs is
15371 // undefined. Ordinarily each undef value will have its own implicit_def of
15372 // a vreg, so force these to use a single register.
15373 SDValue Src0 = Node->getOperand(1);
15374 SDValue Src1 = Node->getOperand(3);
15375 SDValue Src2 = Node->getOperand(5);
15376
15377 if ((Src0.isMachineOpcode() &&
15378 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15379 (Src0 == Src1 || Src0 == Src2))
15380 break;
15381
15382 MVT VT = Src0.getValueType().getSimpleVT();
15383 const TargetRegisterClass *RC =
15384 getRegClassFor(VT, Src0.getNode()->isDivergent());
15385
15387 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15388
15389 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15390 Src0, SDValue());
15391
15392 // src0 must be the same register as src1 or src2, even if the value is
15393 // undefined, so make sure we don't violate this constraint.
15394 if (Src0.isMachineOpcode() &&
15395 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15396 if (Src1.isMachineOpcode() &&
15397 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15398 Src0 = Src1;
15399 else if (Src2.isMachineOpcode() &&
15400 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15401 Src0 = Src2;
15402 else {
15403 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15404 Src0 = UndefReg;
15405 Src1 = UndefReg;
15406 }
15407 } else
15408 break;
15409
15410 SmallVector<SDValue, 9> Ops(Node->ops());
15411 Ops[1] = Src0;
15412 Ops[3] = Src1;
15413 Ops[5] = Src2;
15414 Ops.push_back(ImpDef.getValue(1));
15415 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15416 }
15417 default:
15418 break;
15419 }
15420
15421 return Node;
15422}
15423
15424// Any MIMG instructions that use tfe or lwe require an initialization of the
15425// result register that will be written in the case of a memory access failure.
15426// The required code is also added to tie this init code to the result of the
15427// img instruction.
15428 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15429 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15430 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15431 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15432 MachineBasicBlock &MBB = *MI.getParent();
15433
15434 int DstIdx =
15435 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15436 unsigned InitIdx = 0;
15437
15438 if (TII->isImage(MI)) {
15439 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15440 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15441 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15442
15443 if (!TFE && !LWE) // intersect_ray
15444 return;
15445
15446 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15447 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15448 unsigned D16Val = D16 ? D16->getImm() : 0;
15449
15450 if (!TFEVal && !LWEVal)
15451 return;
15452
15453 // At least one of TFE or LWE are non-zero
15454 // We have to insert a suitable initialization of the result value and
15455 // tie this to the dest of the image instruction.
15456
15457 // Calculate which dword we have to initialize to 0.
15458 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15459
15460 // check that dmask operand is found.
15461 assert(MO_Dmask && "Expected dmask operand in instruction");
15462
15463 unsigned dmask = MO_Dmask->getImm();
15464 // Determine the number of active lanes taking into account the
15465 // Gather4 special case
15466 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15467
15468 bool Packed = !Subtarget->hasUnpackedD16VMem();
15469
15470 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15471
15472 // Abandon attempt if the dst size isn't large enough
15473 // - this is in fact an error but this is picked up elsewhere and
15474 // reported correctly.
15475 uint32_t DstSize =
15476 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15477 if (DstSize < InitIdx)
15478 return;
15479 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15480 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15481 } else {
15482 return;
15483 }
15484
15485 const DebugLoc &DL = MI.getDebugLoc();
15486
15487 // Create a register for the initialization value.
15488 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15489 unsigned NewDst = 0; // Final initialized value will be in here
15490
15491 // If PRTStrictNull feature is enabled (the default) then initialize
15492 // all the result registers to 0, otherwise just the error indication
15493 // register (VGPRn+1)
15494 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15495 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15496
15497 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15498 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15499 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15500 // Initialize dword
15501 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15502 // clang-format off
15503 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15504 .addImm(0);
15505 // clang-format on
15506 // Insert into the super-reg
15507 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15508 .addReg(PrevDst)
15509 .addReg(SubReg)
15510 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15511
15512 PrevDst = NewDst;
15513 }
15514
15515 // Add as an implicit operand
15516 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15517
15518 // Tie the just added implicit operand to the dst
15519 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15520}
15521
15522/// Assign the register class depending on the number of
15523/// bits set in the writemask
15524 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15525 SDNode *Node) const {
15526 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15527
15528 MachineFunction *MF = MI.getParent()->getParent();
15529 MachineRegisterInfo &MRI = MF->getRegInfo();
15530 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15531
15532 if (TII->isVOP3(MI.getOpcode())) {
15533 // Make sure constant bus requirements are respected.
15534 TII->legalizeOperandsVOP3(MRI, MI);
15535
15536 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15537 // This saves a chain-copy of registers and better balance register
15538 // use between vgpr and agpr as agpr tuples tend to be big.
15539 if (!MI.getDesc().operands().empty()) {
15540 unsigned Opc = MI.getOpcode();
15541 bool HasAGPRs = Info->mayNeedAGPRs();
15542 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15543 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15544 for (auto I :
15545 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15546 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15547 if (I == -1)
15548 break;
15549 if ((I == Src2Idx) && (HasAGPRs))
15550 break;
15551 MachineOperand &Op = MI.getOperand(I);
15552 if (!Op.isReg() || !Op.getReg().isVirtual())
15553 continue;
15554 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15555 if (!TRI->hasAGPRs(RC))
15556 continue;
15557 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15558 if (!Src || !Src->isCopy() ||
15559 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15560 continue;
15561 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15562 // All uses of agpr64 and agpr32 can also accept vgpr except for
15563 // v_accvgpr_read, but we do not produce agpr reads during selection,
15564 // so no use checks are needed.
15565 MRI.setRegClass(Op.getReg(), NewRC);
15566 }
15567
15568 if (TII->isMAI(MI)) {
15569 // The ordinary src0, src1, src2 were legalized above.
15570 //
15571 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15572 // as a separate instruction.
15573 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15574 AMDGPU::OpName::scale_src0);
15575 if (Src0Idx != -1) {
15576 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15577 AMDGPU::OpName::scale_src1);
15578 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15579 TII->usesConstantBus(MRI, MI, Src1Idx))
15580 TII->legalizeOpWithMove(MI, Src1Idx);
15581 }
15582 }
15583
15584 if (!HasAGPRs)
15585 return;
15586
15587 // Resolve the rest of AV operands to AGPRs.
15588 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15589 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15590 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15591 if (TRI->isVectorSuperClass(RC)) {
15592 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15593 MRI.setRegClass(Src2->getReg(), NewRC);
15594 if (Src2->isTied())
15595 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15596 }
15597 }
15598 }
15599 }
15600
15601 return;
15602 }
15603
15604 if (TII->isImage(MI))
15605 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15606}
15607
15608 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15609 uint64_t Val) {
15610 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15611 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15612}
15613
15614 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15615 const SDLoc &DL,
15616 SDValue Ptr) const {
15617 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15618
15619 // Build the half of the subregister with the constants before building the
15620 // full 128-bit register. If we are building multiple resource descriptors,
15621 // this will allow CSEing of the 2-component register.
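// (Concretely, the constant pair built here ends up in sub2_sub3 of the
// final 128-bit descriptor while the pointer occupies sub0_sub1, so the
// constant half can be CSE'd across descriptors.)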
15622 const SDValue Ops0[] = {
15623 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15624 buildSMovImm32(DAG, DL, 0),
15625 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15626 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15627 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15628
15629 SDValue SubRegHi = SDValue(
15630 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15631
15632 // Combine the constants and the pointer.
15633 const SDValue Ops1[] = {
15634 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15635 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15636 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15637
15638 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15639}
15640
15641/// Return a resource descriptor with the 'Add TID' bit enabled
15642/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15643/// of the resource descriptor) to create an offset, which is added to
15644/// the resource pointer.
15645 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15646 SDValue Ptr, uint32_t RsrcDword1,
15647 uint64_t RsrcDword2And3) const {
15648 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15649 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15650 if (RsrcDword1) {
15651 PtrHi =
15652 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15653 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15654 0);
15655 }
15656
15657 SDValue DataLo =
15658 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15659 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15660
15661 const SDValue Ops[] = {
15662 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15663 PtrLo,
15664 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15665 PtrHi,
15666 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15667 DataLo,
15668 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15669 DataHi,
15670 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15671
15672 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15673}
15674
15675//===----------------------------------------------------------------------===//
15676// SI Inline Assembly Support
15677//===----------------------------------------------------------------------===//
15678
15679std::pair<unsigned, const TargetRegisterClass *>
15680 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15681 StringRef Constraint,
15682 MVT VT) const {
15683 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15684
15685 const TargetRegisterClass *RC = nullptr;
15686 if (Constraint.size() == 1) {
15687 const unsigned BitWidth = VT.getSizeInBits();
15688 switch (Constraint[0]) {
15689 default:
15690 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15691 case 's':
15692 case 'r':
15693 switch (BitWidth) {
15694 case 16:
15695 RC = &AMDGPU::SReg_32RegClass;
15696 break;
15697 case 64:
15698 RC = &AMDGPU::SGPR_64RegClass;
15699 break;
15700 default:
15701 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15702 if (!RC)
15703 return std::pair(0U, nullptr);
15704 break;
15705 }
15706 break;
15707 case 'v':
15708 switch (BitWidth) {
15709 case 16:
15710 RC = &AMDGPU::VGPR_32RegClass;
15711 break;
15712 default:
15713 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15714 if (!RC)
15715 return std::pair(0U, nullptr);
15716 break;
15717 }
15718 break;
15719 case 'a':
15720 if (!Subtarget->hasMAIInsts())
15721 break;
15722 switch (BitWidth) {
15723 case 16:
15724 RC = &AMDGPU::AGPR_32RegClass;
15725 break;
15726 default:
15727 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15728 if (!RC)
15729 return std::pair(0U, nullptr);
15730 break;
15731 }
15732 break;
15733 }
15734 // We actually support i128, i16 and f16 as inline parameters
15735 // even if they are not reported as legal
15736 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15737 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15738 return std::pair(0U, RC);
15739 }
15740
15741 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15742 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15743 if (RegName.consume_front("v")) {
15744 RC = &AMDGPU::VGPR_32RegClass;
15745 } else if (RegName.consume_front("s")) {
15746 RC = &AMDGPU::SGPR_32RegClass;
15747 } else if (RegName.consume_front("a")) {
15748 RC = &AMDGPU::AGPR_32RegClass;
15749 }
15750
15751 if (RC) {
15752 uint32_t Idx;
15753 if (RegName.consume_front("[")) {
15754 uint32_t End;
15755 bool Failed = RegName.consumeInteger(10, Idx);
15756 Failed |= !RegName.consume_front(":");
15757 Failed |= RegName.consumeInteger(10, End);
15758 Failed |= !RegName.consume_back("]");
15759 if (!Failed) {
15760 uint32_t Width = (End - Idx + 1) * 32;
15761 // Prohibit constraints for register ranges with a width that does not
15762 // match the required type.
15763 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15764 return std::pair(0U, nullptr);
15765 MCRegister Reg = RC->getRegister(Idx);
15766 if (SIRegisterInfo::isVGPRClass(RC))
15767 RC = TRI->getVGPRClassForBitWidth(Width);
15768 else if (SIRegisterInfo::isSGPRClass(RC))
15769 RC = TRI->getSGPRClassForBitWidth(Width);
15770 else if (SIRegisterInfo::isAGPRClass(RC))
15771 RC = TRI->getAGPRClassForBitWidth(Width);
15772 if (RC) {
15773 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15774 return std::pair(Reg, RC);
15775 }
15776 }
15777 } else {
15778 // Check for lossy scalar/vector conversions.
15779 if (VT.isVector() && VT.getSizeInBits() != 32)
15780 return std::pair(0U, nullptr);
15781 bool Failed = RegName.getAsInteger(10, Idx);
15782 if (!Failed && Idx < RC->getNumRegs())
15783 return std::pair(RC->getRegister(Idx), RC);
15784 }
15785 }
15786 }
15787
15788 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15789 if (Ret.first)
15790 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15791
15792 return Ret;
15793}
15794
15795static bool isImmConstraint(StringRef Constraint) {
15796 if (Constraint.size() == 1) {
15797 switch (Constraint[0]) {
15798 default:
15799 break;
15800 case 'I':
15801 case 'J':
15802 case 'A':
15803 case 'B':
15804 case 'C':
15805 return true;
15806 }
15807 } else if (Constraint == "DA" || Constraint == "DB") {
15808 return true;
15809 }
15810 return false;
15811}
15812
15813 SITargetLowering::ConstraintType
15814 SITargetLowering::getConstraintType(StringRef Constraint) const {
15815 if (Constraint.size() == 1) {
15816 switch (Constraint[0]) {
15817 default:
15818 break;
15819 case 's':
15820 case 'v':
15821 case 'a':
15822 return C_RegisterClass;
15823 }
15824 }
15825 if (isImmConstraint(Constraint)) {
15826 return C_Other;
15827 }
15828 return TargetLowering::getConstraintType(Constraint);
15829}
15830
15831static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15832 if (Size < 64) {
15833 Val = Val & maskTrailingOnes<uint64_t>(Size);
15834 }
15835 return Val;
15836}
15837
15838 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15839 StringRef Constraint,
15840 std::vector<SDValue> &Ops,
15841 SelectionDAG &DAG) const {
15842 if (isImmConstraint(Constraint)) {
15843 uint64_t Val;
15844 if (getAsmOperandConstVal(Op, Val) &&
15845 checkAsmConstraintVal(Op, Constraint, Val)) {
15846 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15847 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15848 }
15849 } else {
15850 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15851 }
15852}
15853
15854 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15855 unsigned Size = Op.getScalarValueSizeInBits();
15856 if (Size > 64)
15857 return false;
15858
15859 if (Size == 16 && !Subtarget->has16BitInsts())
15860 return false;
15861
15862 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15863 Val = C->getSExtValue();
15864 return true;
15865 }
15866 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15867 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15868 return true;
15869 }
15870 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15871 if (Size != 16 || Op.getNumOperands() != 2)
15872 return false;
15873 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15874 return false;
15875 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15876 Val = C->getSExtValue();
15877 return true;
15878 }
15879 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15880 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15881 return true;
15882 }
15883 }
15884
15885 return false;
15886}
15887
15888 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15889 uint64_t Val) const {
15890 if (Constraint.size() == 1) {
15891 switch (Constraint[0]) {
15892 case 'I':
15893 return AMDGPU::isInlinableIntLiteral(Val);
15894 case 'J':
15895 return isInt<16>(Val);
15896 case 'A':
15897 return checkAsmConstraintValA(Op, Val);
15898 case 'B':
15899 return isInt<32>(Val);
15900 case 'C':
15901 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15902 AMDGPU::isInlinableIntLiteral(Val);
15903 default:
15904 break;
15905 }
15906 } else if (Constraint.size() == 2) {
15907 if (Constraint == "DA") {
15908 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15909 int64_t LoBits = static_cast<int32_t>(Val);
15910 return checkAsmConstraintValA(Op, HiBits, 32) &&
15911 checkAsmConstraintValA(Op, LoBits, 32);
15912 }
15913 if (Constraint == "DB") {
15914 return true;
15915 }
15916 }
15917 llvm_unreachable("Invalid asm constraint");
15918}
15919
15920 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15921 unsigned MaxSize) const {
15922 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15923 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15924 if (Size == 16) {
15925 MVT VT = Op.getSimpleValueType();
15926 switch (VT.SimpleTy) {
15927 default:
15928 return false;
15929 case MVT::i16:
15930 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15931 case MVT::f16:
15932 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15933 case MVT::bf16:
15934 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15935 case MVT::v2i16:
15936 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15937 case MVT::v2f16:
15938 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15939 case MVT::v2bf16:
15940 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15941 }
15942 }
15943 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15944 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15945 return true;
15946 return false;
15947}
15948
15949static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15950 switch (UnalignedClassID) {
15951 case AMDGPU::VReg_64RegClassID:
15952 return AMDGPU::VReg_64_Align2RegClassID;
15953 case AMDGPU::VReg_96RegClassID:
15954 return AMDGPU::VReg_96_Align2RegClassID;
15955 case AMDGPU::VReg_128RegClassID:
15956 return AMDGPU::VReg_128_Align2RegClassID;
15957 case AMDGPU::VReg_160RegClassID:
15958 return AMDGPU::VReg_160_Align2RegClassID;
15959 case AMDGPU::VReg_192RegClassID:
15960 return AMDGPU::VReg_192_Align2RegClassID;
15961 case AMDGPU::VReg_224RegClassID:
15962 return AMDGPU::VReg_224_Align2RegClassID;
15963 case AMDGPU::VReg_256RegClassID:
15964 return AMDGPU::VReg_256_Align2RegClassID;
15965 case AMDGPU::VReg_288RegClassID:
15966 return AMDGPU::VReg_288_Align2RegClassID;
15967 case AMDGPU::VReg_320RegClassID:
15968 return AMDGPU::VReg_320_Align2RegClassID;
15969 case AMDGPU::VReg_352RegClassID:
15970 return AMDGPU::VReg_352_Align2RegClassID;
15971 case AMDGPU::VReg_384RegClassID:
15972 return AMDGPU::VReg_384_Align2RegClassID;
15973 case AMDGPU::VReg_512RegClassID:
15974 return AMDGPU::VReg_512_Align2RegClassID;
15975 case AMDGPU::VReg_1024RegClassID:
15976 return AMDGPU::VReg_1024_Align2RegClassID;
15977 case AMDGPU::AReg_64RegClassID:
15978 return AMDGPU::AReg_64_Align2RegClassID;
15979 case AMDGPU::AReg_96RegClassID:
15980 return AMDGPU::AReg_96_Align2RegClassID;
15981 case AMDGPU::AReg_128RegClassID:
15982 return AMDGPU::AReg_128_Align2RegClassID;
15983 case AMDGPU::AReg_160RegClassID:
15984 return AMDGPU::AReg_160_Align2RegClassID;
15985 case AMDGPU::AReg_192RegClassID:
15986 return AMDGPU::AReg_192_Align2RegClassID;
15987 case AMDGPU::AReg_256RegClassID:
15988 return AMDGPU::AReg_256_Align2RegClassID;
15989 case AMDGPU::AReg_512RegClassID:
15990 return AMDGPU::AReg_512_Align2RegClassID;
15991 case AMDGPU::AReg_1024RegClassID:
15992 return AMDGPU::AReg_1024_Align2RegClassID;
15993 default:
15994 return -1;
15995 }
15996}
15997
15998// Figure out which registers should be reserved for stack access. Only after
15999// the function is legalized do we know all of the non-spill stack objects or if
16000// calls are present.
16001 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16002 MachineRegisterInfo &MRI = MF.getRegInfo();
16003 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16004 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16005 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16006 const SIInstrInfo *TII = ST.getInstrInfo();
16007
16008 if (Info->isEntryFunction()) {
16009 // Callable functions have fixed registers used for stack access.
16010 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16011 }
16012
16013 // TODO: Move this logic to getReservedRegs()
16014 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16015 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16016 Register SReg = ST.isWave32()
16017 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16018 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16019 &AMDGPU::SGPR_64RegClass);
16020 Info->setSGPRForEXECCopy(SReg);
16021
16022 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16023 Info->getStackPtrOffsetReg()));
16024 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16025 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16026
16027 // We need to worry about replacing the default register with itself in case
16028 // of MIR testcases missing the MFI.
16029 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16030 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16031
16032 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16033 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16034
16035 Info->limitOccupancy(MF);
16036
16037 if (ST.isWave32() && !MF.empty()) {
16038 for (auto &MBB : MF) {
16039 for (auto &MI : MBB) {
16040 TII->fixImplicitOperands(MI);
16041 }
16042 }
16043 }
16044
16045 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16046 // classes if required. Ideally the register class constraints would differ
16047 // per-subtarget, but there's no easy way to achieve that right now. This is
16048 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16049 // from using them as the register class for legal types.
16050 if (ST.needsAlignedVGPRs()) {
16051 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16052 const Register Reg = Register::index2VirtReg(I);
16053 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16054 if (!RC)
16055 continue;
16056 int NewClassID = getAlignedAGPRClassID(RC->getID());
16057 if (NewClassID != -1)
16058 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16059 }
16060 }
16061
16062 TargetLoweringBase::finalizeLowering(MF);
16063}
16064
16065 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16066 KnownBits &Known,
16067 const APInt &DemandedElts,
16068 const SelectionDAG &DAG,
16069 unsigned Depth) const {
16070 Known.resetAll();
16071 unsigned Opc = Op.getOpcode();
16072 switch (Opc) {
16073 case ISD::INTRINSIC_WO_CHAIN: {
16074 unsigned IID = Op.getConstantOperandVal(0);
16075 switch (IID) {
16076 case Intrinsic::amdgcn_mbcnt_lo:
16077 case Intrinsic::amdgcn_mbcnt_hi: {
16078 const GCNSubtarget &ST =
16080 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16081 // most 31 + src1.
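// For example, on a wave32 target the mbcnt contribution is at most 31, so
// bits 5 and above of that contribution are known zero before src1's known
// bits are added in.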
16082 Known.Zero.setBitsFrom(
16083 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16084 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16085 Known = KnownBits::add(Known, Known2);
16086 return;
16087 }
16088 }
16089 break;
16090 }
16091 }
16092 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16093 Op, Known, DemandedElts, DAG, Depth);
16094}
16095
16096 void SITargetLowering::computeKnownBitsForFrameIndex(
16097 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16098 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16099
16100 // Set the high bits to zero based on the maximum allowed scratch size per
16101 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16102 // calculation won't overflow, so assume the sign bit is never set.
16103 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16104}
16105
16106 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16107 KnownBits &Known, unsigned Dim) {
16108 unsigned MaxValue =
16109 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16110 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16111}
16112
16113 void SITargetLowering::computeKnownBitsForTargetInstr(
16114 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16115 const MachineRegisterInfo &MRI, unsigned Depth) const {
16116 const MachineInstr *MI = MRI.getVRegDef(R);
16117 switch (MI->getOpcode()) {
16118 case AMDGPU::G_INTRINSIC:
16119 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16120 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16121 switch (IID) {
16122 case Intrinsic::amdgcn_workitem_id_x:
16123 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16124 break;
16125 case Intrinsic::amdgcn_workitem_id_y:
16126 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16127 break;
16128 case Intrinsic::amdgcn_workitem_id_z:
16129 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16130 break;
16131 case Intrinsic::amdgcn_mbcnt_lo:
16132 case Intrinsic::amdgcn_mbcnt_hi: {
16133 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16134 // most 31 + src1.
16135 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16136 ? getSubtarget()->getWavefrontSizeLog2()
16137 : 5);
16138 KnownBits Known2;
16139 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16140 Depth + 1);
16141 Known = KnownBits::add(Known, Known2);
16142 break;
16143 }
16144 case Intrinsic::amdgcn_groupstaticsize: {
16145 // We can report everything over the maximum size as 0. We can't report
16146 // based on the actual size because we don't know if it's accurate or not
16147 // at any given point.
16148 Known.Zero.setHighBits(
16149 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16150 break;
16151 }
16152 }
16153 break;
16154 }
16155 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16156 Known.Zero.setHighBits(24);
16157 break;
16158 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16159 Known.Zero.setHighBits(16);
16160 break;
16161 case AMDGPU::G_AMDGPU_SMED3:
16162 case AMDGPU::G_AMDGPU_UMED3: {
16163 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16164
16165 KnownBits Known2;
16166 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16167 if (Known2.isUnknown())
16168 break;
16169
16170 KnownBits Known1;
16171 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16172 if (Known1.isUnknown())
16173 break;
16174
16175 KnownBits Known0;
16176 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16177 if (Known0.isUnknown())
16178 break;
16179
16180 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16181 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16182 Known.One = Known0.One & Known1.One & Known2.One;
16183 break;
16184 }
16185 }
16186}
16187
16188 Align SITargetLowering::computeKnownAlignForTargetInstr(
16189 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16190 unsigned Depth) const {
16191 const MachineInstr *MI = MRI.getVRegDef(R);
16192 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16193 // FIXME: Can this move to generic code? What about the case where the call
16194 // site specifies a lower alignment?
16195 Intrinsic::ID IID = GI->getIntrinsicID();
16196 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16197 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16198 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16199 return *RetAlign;
16200 }
16201 return Align(1);
16202}
16203
16204 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16205 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16206 const Align CacheLineAlign = Align(64);
16207
16208 // Pre-GFX10 target did not benefit from loop alignment
16209 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16210 getSubtarget()->hasInstFwdPrefetchBug())
16211 return PrefAlign;
16212
16213 // On GFX10 I$ is 4 x 64 bytes cache lines.
16214 // By default prefetcher keeps one cache line behind and reads two ahead.
16215 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16216 // behind and one ahead.
16217 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
16218 // If the loop fits in 64 bytes it always spans no more than two cache lines and
16219 // does not need alignment.
16220 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
16221 // if it is at most 192 bytes we need the two-lines-behind setting.
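// In short (summarizing the checks below): loops over 192 bytes keep the
// default alignment, loops of at most 64 bytes need nothing, loops of at
// most 128 bytes only get the 64-byte alignment, and larger loops up to
// 192 bytes additionally get S_INST_PREFETCH instructions around them.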
16222
16223 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16224 const MachineBasicBlock *Header = ML->getHeader();
16225 if (Header->getAlignment() != PrefAlign)
16226 return Header->getAlignment(); // Already processed.
16227
16228 unsigned LoopSize = 0;
16229 for (const MachineBasicBlock *MBB : ML->blocks()) {
16230 // If an inner loop block is aligned, assume on average half of the
16231 // alignment size is added as nops.
16232 if (MBB != Header)
16233 LoopSize += MBB->getAlignment().value() / 2;
16234
16235 for (const MachineInstr &MI : *MBB) {
16236 LoopSize += TII->getInstSizeInBytes(MI);
16237 if (LoopSize > 192)
16238 return PrefAlign;
16239 }
16240 }
16241
16242 if (LoopSize <= 64)
16243 return PrefAlign;
16244
16245 if (LoopSize <= 128)
16246 return CacheLineAlign;
16247
16248 // If any of parent loops is surrounded by prefetch instructions do not
16249 // insert new for inner loop, which would reset parent's settings.
16250 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16251 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16252 auto I = Exit->getFirstNonDebugInstr();
16253 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16254 return CacheLineAlign;
16255 }
16256 }
16257
16258 MachineBasicBlock *Pre = ML->getLoopPreheader();
16259 MachineBasicBlock *Exit = ML->getExitBlock();
16260
16261 if (Pre && Exit) {
16262 auto PreTerm = Pre->getFirstTerminator();
16263 if (PreTerm == Pre->begin() ||
16264 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16265 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16266 .addImm(1); // prefetch 2 lines behind PC
16267
16268 auto ExitHead = Exit->getFirstNonDebugInstr();
16269 if (ExitHead == Exit->end() ||
16270 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16271 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16272 .addImm(2); // prefetch 1 line behind PC
16273 }
16274
16275 return CacheLineAlign;
16276}
16277
16278 LLVM_ATTRIBUTE_UNUSED
16279 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16280 assert(N->getOpcode() == ISD::CopyFromReg);
16281 do {
16282 // Follow the chain until we find an INLINEASM node.
16283 N = N->getOperand(0).getNode();
16284 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16285 return true;
16286 } while (N->getOpcode() == ISD::CopyFromReg);
16287 return false;
16288}
16289
16290 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16291 FunctionLoweringInfo *FLI,
16292 UniformityInfo *UA) const {
16293 switch (N->getOpcode()) {
16294 case ISD::CopyFromReg: {
16295 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16296 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16297 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16298 Register Reg = R->getReg();
16299
16300 // FIXME: Why does this need to consider isLiveIn?
16301 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16302 return !TRI->isSGPRReg(MRI, Reg);
16303
16304 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16305 return UA->isDivergent(V);
16306
16307 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16308 return !TRI->isSGPRReg(MRI, Reg);
16309 }
16310 case ISD::LOAD: {
16311 const LoadSDNode *L = cast<LoadSDNode>(N);
16312 unsigned AS = L->getAddressSpace();
16313 // A flat load may access private memory.
16314 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16315 }
16316 case ISD::CALLSEQ_END:
16317 return true;
16318 case ISD::INTRINSIC_WO_CHAIN:
16319 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16320 case ISD::INTRINSIC_W_CHAIN:
16321 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16322 case AMDGPUISD::ATOMIC_CMP_SWAP:
16323 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
16324 case AMDGPUISD::BUFFER_ATOMIC_ADD:
16325 case AMDGPUISD::BUFFER_ATOMIC_SUB:
16326 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
16327 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
16328 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
16329 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
16330 case AMDGPUISD::BUFFER_ATOMIC_AND:
16331 case AMDGPUISD::BUFFER_ATOMIC_OR:
16332 case AMDGPUISD::BUFFER_ATOMIC_XOR:
16333 case AMDGPUISD::BUFFER_ATOMIC_INC:
16334 case AMDGPUISD::BUFFER_ATOMIC_DEC:
16335 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16336 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
16337 case AMDGPUISD::BUFFER_ATOMIC_FADD:
16338 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16339 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16340 // Target-specific read-modify-write atomics are sources of divergence.
16341 return true;
16342 default:
16343 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16344 // Generic read-modify-write atomics are sources of divergence.
16345 return A->readMem() && A->writeMem();
16346 }
16347 return false;
16348 }
16349}
16350
16351 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16352 EVT VT) const {
16353 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16354 case MVT::f32:
16355 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16356 case MVT::f64:
16357 case MVT::f16:
16358 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16359 default:
16360 return false;
16361 }
16362}
16363
16364 bool SITargetLowering::denormalsEnabledForType(
16365 LLT Ty, const MachineFunction &MF) const {
16366 switch (Ty.getScalarSizeInBits()) {
16367 case 32:
16368 return !denormalModeIsFlushAllF32(MF);
16369 case 64:
16370 case 16:
16371 return !denormalModeIsFlushAllF64F16(MF);
16372 default:
16373 return false;
16374 }
16375}
16376
16377 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16378 const SelectionDAG &DAG,
16379 bool SNaN,
16380 unsigned Depth) const {
16381 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16382 const MachineFunction &MF = DAG.getMachineFunction();
16383 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16384
16385 if (Info->getMode().DX10Clamp)
16386 return true; // Clamped to 0.
16387 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16388 }
16389
16390 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16391 Depth);
16392}
16393
16394// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16395// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16396 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16397 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16398 return true;
16399
16401 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16402 if (DenormMode == DenormalMode::getPreserveSign())
16403 return true;
16404
16405 // TODO: Remove this.
16406 return RMW->getFunction()
16407 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16408 .getValueAsBool();
16409}
16410
16412 LLVMContext &Ctx = RMW->getContext();
16413 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16414 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16415
16416 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16417 << "Hardware instruction generated for atomic "
16418 << RMW->getOperationName(RMW->getOperation())
16419 << " operation at memory scope " << MemScope;
16420}
16421
16422static bool isV2F16OrV2BF16(Type *Ty) {
16423 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16424 Type *EltTy = VT->getElementType();
16425 return VT->getNumElements() == 2 &&
16426 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16427 }
16428
16429 return false;
16430}
16431
16432static bool isV2F16(Type *Ty) {
16433 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16434 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16435}
16436
16437static bool isV2BF16(Type *Ty) {
16438 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16439 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16440}
16441
16442/// \return true if atomicrmw integer ops work for the type.
16443static bool isAtomicRMWLegalIntTy(Type *Ty) {
16444 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16445 unsigned BW = IT->getBitWidth();
16446 return BW == 32 || BW == 64;
16447 }
16448
16449 return false;
16450}
16451
16452/// \return true if this atomicrmw xchg type can be selected.
16453static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16454 Type *Ty = RMW->getType();
16455 if (isAtomicRMWLegalIntTy(Ty))
16456 return true;
16457
16458 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16459 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16460 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16461 return BW == 32 || BW == 64;
16462 }
16463
16464 if (Ty->isFloatTy() || Ty->isDoubleTy())
16465 return true;
16466
16467 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16468 return VT->getNumElements() == 2 &&
16469 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16470 }
16471
16472 return false;
16473}
16474
16475/// \returns true if it's valid to emit a native instruction for \p RMW, based
16476/// on the properties of the target memory.
16477static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16478 const AtomicRMWInst *RMW,
16479 bool HasSystemScope) {
16480 // The remote/fine-grained access logic is different from the integer
16481 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16482 // fine-grained access does not work, even for a device local allocation.
16483 //
16484 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16485 // allocations work.
16486 if (HasSystemScope) {
16487 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16488 RMW->hasMetadata("amdgpu.no.remote.memory"))
16489 return true;
16490 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16491 return true;
16492
16493 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16494}
16495
16496/// \return Action to perform on AtomicRMWInsts for integer operations.
16497 static TargetLowering::AtomicExpansionKind
16498 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16499 return isAtomicRMWLegalIntTy(RMW->getType())
16500 ? TargetLowering::AtomicExpansionKind::None
16501 : TargetLowering::AtomicExpansionKind::CmpXChg;
16502}
16503
16504/// Return if a flat address space atomicrmw can access private memory.
16505 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16506 const MDNode *NoaliasAddrSpaceMD =
16507 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16508 if (!NoaliasAddrSpaceMD)
16509 return true;
16510
16511 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16512 ++I) {
16513 auto *Low = mdconst::extract<ConstantInt>(
16514 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16515 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16516 auto *High = mdconst::extract<ConstantInt>(
16517 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16518 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16519 }
16520 }
16521
16522 return true;
16523}
16524
16525 TargetLowering::AtomicExpansionKind
16526 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16527 unsigned AS = RMW->getPointerAddressSpace();
16528 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16529 return AtomicExpansionKind::NotAtomic;
16530
16531 // 64-bit flat atomics that dynamically reside in private memory will silently
16532 // be dropped.
16533 //
16534 // Note that we will emit a new copy of the original atomic in the expansion,
16535 // which will be incrementally relegalized.
16536 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16537 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16538 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16539 flatInstrMayAccessPrivate(RMW))
16540 return AtomicExpansionKind::Expand;
16541
16542 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16543 OptimizationRemarkEmitter ORE(RMW->getFunction());
16544 ORE.emit([=]() {
16545 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16546 });
16547 return Kind;
16548 };
16549
16550 auto SSID = RMW->getSyncScopeID();
16551 bool HasSystemScope =
16552 SSID == SyncScope::System ||
16553 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16554
16555 auto Op = RMW->getOperation();
16556 switch (Op) {
16557 case AtomicRMWInst::Xchg: {
16558 // PCIe supports add and xchg for system atomics.
16559 return isAtomicRMWLegalXChgTy(RMW)
16560 ? AtomicExpansionKind::None
16561 : AtomicExpansionKind::CmpXChg;
16562 }
16563 case AtomicRMWInst::Add:
16564 case AtomicRMWInst::And:
16565 case AtomicRMWInst::UIncWrap:
16566 case AtomicRMWInst::UDecWrap:
16567 return atomicSupportedIfLegalIntType(RMW);
16568 case AtomicRMWInst::Sub:
16569 case AtomicRMWInst::Or:
16570 case AtomicRMWInst::Xor: {
16571 // Atomic sub/or/xor do not work over PCI express, but atomic add
16572 // does. InstCombine transforms these with 0 to or, so undo that.
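// (For example: an atomicrmw or with a zero operand is expanded back into
// the equivalent atomicrmw add with zero, which PCIe does support.)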
16573 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16574 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16575 ConstVal && ConstVal->isNullValue())
16576 return AtomicExpansionKind::Expand;
16577 }
16578
16579 return atomicSupportedIfLegalIntType(RMW);
16580 }
16581 case AtomicRMWInst::FAdd: {
16582 Type *Ty = RMW->getType();
16583
16584 // TODO: Handle REGION_ADDRESS
16585 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16586 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16587 // is fixed to round-to-nearest-even.
16588 //
16589 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16590 // round-to-nearest-even.
16591 //
16592 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16593 // suggests it is OK if the floating-point mode may not match the calling
16594 // thread.
16595 if (Ty->isFloatTy()) {
16596 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16597 : AtomicExpansionKind::CmpXChg;
16598 }
16599
16600 if (Ty->isDoubleTy()) {
16601 // Ignores denormal mode, but we don't consider flushing mandatory.
16602 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16603 : AtomicExpansionKind::CmpXChg;
16604 }
16605
16606 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16607 return AtomicExpansionKind::None;
16608
16609 return AtomicExpansionKind::CmpXChg;
16610 }
16611
16612 // LDS atomics respect the denormal mode from the mode register.
16613 //
16614 // Traditionally f32 global/buffer memory atomics would unconditionally
16615 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16616 // flush.
16617 //
16618 // On targets with flat atomic fadd, denormals would flush depending on
16619 // whether the target address resides in LDS or global memory. We consider
16620 // this flat-maybe-flush as will-flush.
16621 if (Ty->isFloatTy() &&
16622 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16623 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16624 return AtomicExpansionKind::CmpXChg;
16625
16626 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16627 // safe. The message phrasing also should be better.
16628 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16629 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16630 // gfx940, gfx12
16631 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16632 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16633 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16634 // gfx90a, gfx940, gfx12
16635 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16636 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16637
16638 // gfx940, gfx12
16639 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16640 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16641 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16642 // gfx90a, gfx940, gfx12
16643 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16644 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16645
16646 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16647 // buffer. gfx12 does have the buffer version.
16648 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16649 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16650 }
16651
16652 // global and flat atomic fadd f64: gfx90a, gfx940.
16653 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16654 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16655
16656 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16657 if (Ty->isFloatTy()) {
16658 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16659 // gfx11+.
16660 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16661 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16662 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16663 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16664 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16665 } else {
16666 // gfx908
16667 if (RMW->use_empty() &&
16668 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16669 isV2F16(Ty))
16670 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16671 }
16672 }
16673
16674 // flat atomic fadd f32: gfx940, gfx11+.
16675 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16676 if (Subtarget->hasFlatAtomicFaddF32Inst())
16677 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16678
16679 // If the address is in the flat address space and the type is float, we
16680 // try to expand it when the target supports both global and LDS atomic
16681 // fadd. The reason is that the expansion emits a runtime check of the
16682 // address space: if the address is in the global address space, we emit
16683 // the global atomic fadd; if it is in the shared address space, we emit
16684 // the LDS atomic fadd.
16685 if (Subtarget->hasLDSFPAtomicAddF32()) {
16686 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16687 return AtomicExpansionKind::Expand;
16688 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16689 return AtomicExpansionKind::Expand;
16690 }
16691 }
16692 }
16693
16694 return AtomicExpansionKind::CmpXChg;
16695 }
16696 case AtomicRMWInst::FMin:
16697 case AtomicRMWInst::FMax: {
16698 Type *Ty = RMW->getType();
16699
16700 // LDS float and double fmin/fmax were always supported.
16701 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16702 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16703 : AtomicExpansionKind::CmpXChg;
16704 }
16705
16706 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16707 // For flat and global cases:
16708 // float, double in gfx7. Manual claims denormal support.
16709 // Removed in gfx8.
16710 // float, double restored in gfx10.
16711 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16712 //
16713 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16714 // no f32.
16715 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16716 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16717 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16718 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16719 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16720 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16721 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16722 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16723 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16724 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16725 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16726 }
16727 }
16728
16729 return AtomicExpansionKind::CmpXChg;
16730 }
16731 case AtomicRMWInst::Min:
16732 case AtomicRMWInst::Max:
16733 case AtomicRMWInst::UMin:
16734 case AtomicRMWInst::UMax: {
16735 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16736 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16737 // Always expand system scope min/max atomics.
16738 if (HasSystemScope)
16739 return AtomicExpansionKind::CmpXChg;
16740 }
16741
16742 return atomicSupportedIfLegalIntType(RMW);
16743 }
16744 case AtomicRMWInst::Nand:
16745 case AtomicRMWInst::FSub:
16746 default:
16747 return AtomicExpansionKind::CmpXChg;
16748 }
16749
16750 llvm_unreachable("covered atomicrmw op switch");
16751}
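As a rough illustration of the cases classified above, the sketch below (not part of this file; the helper name makeLdsFAddAtomic is hypothetical) builds the kind of LDS fadd atomicrmw handled by the LOCAL_ADDRESS branch: with LDS FP atomics available it is left alone (AtomicExpansionKind::None), otherwise AtomicExpandPass turns it into a cmpxchg loop.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper, for illustration only.
static AtomicRMWInst *makeLdsFAddAtomic(IRBuilder<> &B, Value *LdsPtr,
                                        Value *Val) {
  // Builds: atomicrmw fadd ptr addrspace(3) %p, float %v monotonic, align 4
  return B.CreateAtomicRMW(AtomicRMWInst::FAdd, LdsPtr, Val, MaybeAlign(4),
                           AtomicOrdering::Monotonic);
}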
16752
16753 TargetLowering::AtomicExpansionKind
16754 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16755 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16756 ? AtomicExpansionKind::NotAtomic
16757 : AtomicExpansionKind::None;
16758 }
16759
16760 TargetLowering::AtomicExpansionKind
16761 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16762 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16763 ? AtomicExpansionKind::NotAtomic
16764 : AtomicExpansionKind::None;
16765 }
16766
16767 TargetLowering::AtomicExpansionKind
16768 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16769 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16770 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16771 return AtomicExpansionKind::NotAtomic;
16772
16773 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16774 return AtomicExpansionKind::None;
16775
16776 const DataLayout &DL = CmpX->getDataLayout();
16777
16778 Type *ValTy = CmpX->getNewValOperand()->getType();
16779
16780 // If a 64-bit flat atomic may alias private, we need to avoid using the
16781 // atomic in the private case.
16782 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16783 : AtomicExpansionKind::None;
16784 }
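For reference, a minimal sketch (not part of this file; makeFlatCmpXchg64 is a hypothetical name) of the 64-bit flat cmpxchg that the hook above sends to AtomicExpansionKind::Expand when the flat pointer may alias private memory:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper, for illustration only.
static AtomicCmpXchgInst *makeFlatCmpXchg64(IRBuilder<> &B, Value *FlatPtr,
                                            Value *Expected, Value *Desired) {
  // Builds: cmpxchg ptr %p, i64 %expected, i64 %desired seq_cst seq_cst
  return B.CreateAtomicCmpXchg(FlatPtr, Expected, Desired, MaybeAlign(8),
                               AtomicOrdering::SequentiallyConsistent,
                               AtomicOrdering::SequentiallyConsistent);
}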
16785
16786const TargetRegisterClass *
16787 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16788 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16789 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16790 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16791 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16792 : &AMDGPU::SReg_32RegClass;
16793 if (!TRI->isSGPRClass(RC) && !isDivergent)
16794 return TRI->getEquivalentSGPRClass(RC);
16795 if (TRI->isSGPRClass(RC) && isDivergent)
16796 return TRI->getEquivalentVGPRClass(RC);
16797
16798 return RC;
16799}
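A small usage sketch, illustrative only and assuming TLI is an already-constructed SITargetLowering: instruction selection queries this hook per value and divergence, so a divergent f32 is assigned a VGPR class while a uniform one is mapped to the equivalent SGPR class.

#include "SIISelLowering.h"
using namespace llvm;

// Illustrative only; TLI is assumed to be an existing SITargetLowering.
static void queryRegClasses(const SITargetLowering &TLI) {
  const TargetRegisterClass *DivergentRC =
      TLI.getRegClassFor(MVT::f32, /*isDivergent=*/true);  // a VGPR class
  const TargetRegisterClass *UniformRC =
      TLI.getRegClassFor(MVT::f32, /*isDivergent=*/false); // an SGPR class
  (void)DivergentRC;
  (void)UniformRC;
}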
16800
16801// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16802// uniform values (as produced by the mask results of control flow intrinsics)
16803// used outside of divergent blocks. The phi users need to also be treated as
16804// always uniform.
16805//
16806// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16807static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16808 unsigned WaveSize) {
16809 // FIXME: We assume we never cast the mask results of a control flow
16810 // intrinsic.
16811 // Early exit if the type won't be consistent as a compile time hack.
16812 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16813 if (!IT || IT->getBitWidth() != WaveSize)
16814 return false;
16815
16816 if (!isa<Instruction>(V))
16817 return false;
16818 if (!Visited.insert(V).second)
16819 return false;
16820 bool Result = false;
16821 for (const auto *U : V->users()) {
16822 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16823 if (V == U->getOperand(1)) {
16824 switch (Intrinsic->getIntrinsicID()) {
16825 default:
16826 Result = false;
16827 break;
16828 case Intrinsic::amdgcn_if_break:
16829 case Intrinsic::amdgcn_if:
16830 case Intrinsic::amdgcn_else:
16831 Result = true;
16832 break;
16833 }
16834 }
16835 if (V == U->getOperand(0)) {
16836 switch (Intrinsic->getIntrinsicID()) {
16837 default:
16838 Result = false;
16839 break;
16840 case Intrinsic::amdgcn_end_cf:
16841 case Intrinsic::amdgcn_loop:
16842 Result = true;
16843 break;
16844 }
16845 }
16846 } else {
16847 Result = hasCFUser(U, Visited, WaveSize);
16848 }
16849 if (Result)
16850 break;
16851 }
16852 return Result;
16853}
16854
16855 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16856 const Value *V) const {
16857 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16858 if (CI->isInlineAsm()) {
16859 // FIXME: This cannot give a correct answer. This should only trigger in
16860 // the case where inline asm returns mixed SGPR and VGPR results, used
16861 // outside the defining block. We don't have a specific result to
16862 // consider, so this assumes if any value is SGPR, the overall register
16863 // also needs to be SGPR.
16864 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16865 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16866 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16867 for (auto &TC : TargetConstraints) {
16868 if (TC.Type == InlineAsm::isOutput) {
16869 ComputeConstraintToUse(TC, SDValue());
16870 const TargetRegisterClass *RC =
16871 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16872 TC.ConstraintVT)
16873 .second;
16874 if (RC && SIRI->isSGPRClass(RC))
16875 return true;
16876 }
16877 }
16878 }
16879 }
16880 SmallPtrSet<const Value *, 16> Visited;
16881 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16882}
16883
16884 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16885 for (SDUse &Use : N->uses()) {
16886 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16887 if (getBasePtrIndex(M) == Use.getOperandNo())
16888 return true;
16889 }
16890 }
16891 return false;
16892}
16893
16894 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16895 SDValue N1) const {
16896 if (!N0.hasOneUse())
16897 return false;
16898 // Take care of the opportunity to keep N0 uniform
16899 if (N0->isDivergent() || !N1->isDivergent())
16900 return true;
16901 // Check if we have a good chance to form the memory access pattern with the
16902 // base and offset
16903 return (DAG.isBaseWithConstantOffset(N0) &&
16904 hasMemSDNodeUser(*N0->user_begin()));
16905 }
16906
16907 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16908 Register N0, Register N1) const {
16909 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16910}
16911
16912 MachineMemOperand::Flags
16913 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16914 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16915 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16916 if (I.getMetadata("amdgpu.noclobber"))
16917 Flags |= MONoClobber;
16918 if (I.getMetadata("amdgpu.last.use"))
16919 Flags |= MOLastUse;
16920 return Flags;
16921}
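A minimal sketch (illustrative only; markNoClobber is a hypothetical name) of how a producer such as AMDGPUAnnotateUniformValues can attach the metadata that getTargetMMOFlags above folds into the memory operand flags:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Hypothetical helper, for illustration only.
static void markNoClobber(LoadInst &LI) {
  // Only the presence of the metadata kind matters to getTargetMMOFlags,
  // so an empty metadata node is enough.
  LI.setMetadata("amdgpu.noclobber", MDNode::get(LI.getContext(), {}));
}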
16922
16923 bool SITargetLowering::checkForPhysRegDependency(
16924 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16925 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16926 if (User->getOpcode() != ISD::CopyToReg)
16927 return false;
16928 if (!Def->isMachineOpcode())
16929 return false;
16930 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16931 if (!MDef)
16932 return false;
16933
16934 unsigned ResNo = User->getOperand(Op).getResNo();
16935 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16936 return false;
16937 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16938 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16939 PhysReg = AMDGPU::SCC;
16940 const TargetRegisterClass *RC =
16941 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16942 Cost = RC->getCopyCost();
16943 return true;
16944 }
16945 return false;
16946}
16947
16948 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16949 Instruction *AI) const {
16950 // Given: atomicrmw fadd ptr %addr, float %val ordering
16951 //
16952 // With this expansion we produce the following code:
16953 // [...]
16954 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16955 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16956 //
16957 // atomicrmw.shared:
16958 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16959 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16960 // float %val ordering
16961 // br label %atomicrmw.phi
16962 //
16963 // atomicrmw.check.private:
16964 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16965 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16966 //
16967 // atomicrmw.private:
16968 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16969 // %loaded.private = load float, ptr addrspace(5) %cast.private
16970 // %val.new = fadd float %loaded.private, %val
16971 // store float %val.new, ptr addrspace(5) %cast.private
16972 // br label %atomicrmw.phi
16973 //
16974 // atomicrmw.global:
16975 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16976 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16977 // float %val ordering
16978 // br label %atomicrmw.phi
16979 //
16980 // atomicrmw.phi:
16981 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16982 // [ %loaded.private, %atomicrmw.private ],
16983 // [ %loaded.global, %atomicrmw.global ]
16984 // br label %atomicrmw.end
16985 //
16986 // atomicrmw.end:
16987 // [...]
16988 //
16989 //
16990 // For 64-bit atomics which may reside in private memory, we perform a simpler
16991 // version that only inserts the private check, and uses the flat operation.
16992
16993 IRBuilder<> Builder(AI);
16994 LLVMContext &Ctx = Builder.getContext();
16995
16996 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16997 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16998 : AtomicCmpXchgInst::getPointerOperandIndex();
16999 Value *Addr = AI->getOperand(PtrOpIdx);
17000
17001 /// TODO: Only need to check private, then emit flat-known-not private (no
17002 /// need for shared block, or cast to global).
17003 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17004
17005 Align Alignment;
17006 if (RMW)
17007 Alignment = RMW->getAlign();
17008 else if (CX)
17009 Alignment = CX->getAlign();
17010 else
17011 llvm_unreachable("unhandled atomic operation");
17012
17013 // FullFlatEmulation is true if we need to issue the private, shared, and
17014 // global cases.
17015 //
17016 // If this is false, we are only dealing with the flat-targeting-private case,
17017 // where we only insert a check for private and still use the flat instruction
17018 // for global and shared.
17019
17020 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17021 Subtarget->hasAtomicFaddInsts() &&
17022 RMW->getType()->isFloatTy();
17023
17024 // If the return value isn't used, do not introduce a false use in the phi.
17025 bool ReturnValueIsUsed = !AI->use_empty();
17026
17027 BasicBlock *BB = Builder.GetInsertBlock();
17028 Function *F = BB->getParent();
17029 BasicBlock *ExitBB =
17030 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17031 BasicBlock *SharedBB = nullptr;
17032
17033 BasicBlock *CheckPrivateBB = BB;
17034 if (FullFlatEmulation) {
17035 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17036 CheckPrivateBB =
17037 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17038 }
17039
17040 BasicBlock *PrivateBB =
17041 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17042 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17043 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17044
17045 std::prev(BB->end())->eraseFromParent();
17046 Builder.SetInsertPoint(BB);
17047
17048 Value *LoadedShared = nullptr;
17049 if (FullFlatEmulation) {
17050 CallInst *IsShared = Builder.CreateIntrinsic(
17051 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17052 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17053 Builder.SetInsertPoint(SharedBB);
17054 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17055 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17056
17057 Instruction *Clone = AI->clone();
17058 Clone->insertInto(SharedBB, SharedBB->end());
17059 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17060 LoadedShared = Clone;
17061
17062 Builder.CreateBr(PhiBB);
17063 Builder.SetInsertPoint(CheckPrivateBB);
17064 }
17065
17066 CallInst *IsPrivate = Builder.CreateIntrinsic(
17067 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17068 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17069
17070 Builder.SetInsertPoint(PrivateBB);
17071
17072 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17073 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17074
17075 Value *LoadedPrivate;
17076 if (RMW) {
17077 LoadedPrivate = Builder.CreateAlignedLoad(
17078 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17079
17080 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17081 LoadedPrivate, RMW->getValOperand());
17082
17083 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17084 } else {
17085 auto [ResultLoad, Equal] =
17086 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17087 CX->getNewValOperand(), CX->getAlign());
17088
17089 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17090 ResultLoad, 0);
17091 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17092 }
17093
17094 Builder.CreateBr(PhiBB);
17095
17096 Builder.SetInsertPoint(GlobalBB);
17097
17098 // Continue using a flat instruction if we only emitted the check for private.
17099 Instruction *LoadedGlobal = AI;
17100 if (FullFlatEmulation) {
17101 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17102 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17103 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17104 }
17105
17106 AI->removeFromParent();
17107 AI->insertInto(GlobalBB, GlobalBB->end());
17108
17109 // The new atomicrmw may go through another round of legalization later.
17110 if (!FullFlatEmulation) {
17111 // We inserted the runtime check already, make sure we do not try to
17112 // re-expand this.
17113 // TODO: Should union with any existing metadata.
17114 MDBuilder MDB(F->getContext());
17115 MDNode *RangeNotPrivate =
17116 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17117 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17118 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17119 RangeNotPrivate);
17120 }
17121
17122 Builder.CreateBr(PhiBB);
17123
17124 Builder.SetInsertPoint(PhiBB);
17125
17126 if (ReturnValueIsUsed) {
17127 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17128 AI->replaceAllUsesWith(Loaded);
17129 if (FullFlatEmulation)
17130 Loaded->addIncoming(LoadedShared, SharedBB);
17131 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17132 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17133 Loaded->takeName(AI);
17134 }
17135
17136 Builder.CreateBr(ExitBB);
17137}
17138
17139 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17140 AtomicRMWInst::BinOp Op = AI->getOperation();
17141
17142 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17143 Op == AtomicRMWInst::Xor) {
17144 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17145 ConstVal && ConstVal->isNullValue()) {
17146 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17147 AI->setOperation(AtomicRMWInst::Add);
17148
17149 // We may still need the private-alias-flat handling below.
17150
17151 // TODO: Skip this for cases where we cannot access remote memory.
17152 }
17153 }
17154
17155 // The non-flat expansions should only perform the de-canonicalization of
17156 // identity values.
17157 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17158 return;
17159
17160 emitExpandAtomicAddrSpacePredicate(AI);
17161 }
17162
17163 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17164 emitExpandAtomicAddrSpacePredicate(CI);
17165 }
17166
17167 LoadInst *
17168 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17169 IRBuilder<> Builder(AI);
17170 auto Order = AI->getOrdering();
17171
17172 // The optimization removes the store aspect of the atomicrmw. Therefore, the
17173 // cache must be flushed if the atomic ordering had release semantics. This is
17174 // not necessarily a fence; a release fence just happens to perform that flush.
17175 // Avoid replacing an atomicrmw that has release semantics.
17176 if (isReleaseOrStronger(Order))
17177 return nullptr;
17178
17179 LoadInst *LI = Builder.CreateAlignedLoad(
17180 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17181 LI->setAtomic(Order, AI->getSyncScopeID());
17182 LI->copyMetadata(*AI);
17183 LI->takeName(AI);
17184 AI->replaceAllUsesWith(LI);
17185 AI->eraseFromParent();
17186 return LI;
17187}
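For context, a minimal sketch (illustrative only; makeIdempotentOr is a hypothetical name) of the idempotent pattern this helper rewrites: "atomicrmw or ptr %p, i32 0 acquire" becomes "load atomic i32, ptr %p acquire, align 4", while release-or-stronger orderings are left untouched (the helper returns nullptr).

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper, for illustration only.
static AtomicRMWInst *makeIdempotentOr(IRBuilder<> &B, Value *Ptr) {
  // Builds: atomicrmw or ptr %p, i32 0 acquire, align 4
  return B.CreateAtomicRMW(AtomicRMWInst::Or, Ptr, B.getInt32(0),
                           MaybeAlign(4), AtomicOrdering::Acquire);
}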
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1117
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5465
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1484
bool isNegative() const
Definition: APFloat.h:1440
APInt bitcastToAPInt() const
Definition: APFloat.h:1346
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1135
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1095
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1076
bool isInfinity() const
Definition: APFloat.h:1437
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1459
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2556
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1809
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:189
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:188
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1158
LLVMContext & getContext() const
Definition: IRBuilder.h:190
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1152
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:194
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1828
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2699
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
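A short sketch of how the Instruction manipulation entries above are typically combined (I and Repl are assumed Instruction pointers of the same type, TargetBB an assumed basic block):
  Instruction *Copy = I->clone();                // detached copy, no parent yet
  Copy->insertInto(TargetBB, TargetBB->begin()); // link the copy into TargetBB
  Repl->copyMetadata(*I);                        // keep !tbaa, !range, ... on the replacement
  I->replaceAllUsesWith(Repl);
  I->eraseFromParent();                          // unlink and delete the original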
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
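A few illustrative LLT constructions using the helpers above (the bit widths and the global address space are arbitrary examples):
  LLT S32 = LLT::scalar(32);                            // 32-bit scalar
  LLT P1  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64); // 64-bit pointer in AS 1
  unsigned EltBits = S32.getScalarSizeInBits();         // 32
  bool IsScalar = S32.isScalar();                       // true
  TypeSize Bits = P1.getSizeInBits();                   // 64 bits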
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
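Sketch of emitting an atomic load with a named synchronization scope, combining the LLVMContext and LoadInst entries above (Builder, Int32Ty and Ptr are assumed; "agent" is just an example scope name used on AMDGPU):
  LLVMContext &Ctx = Builder.getContext();
  SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
  LoadInst *LI = Builder.CreateAlignedLoad(Int32Ty, Ptr, Align(4), "ld");
  LI->setAtomic(AtomicOrdering::Acquire, AgentSSID);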
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
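Sketch of attaching !range metadata with MDBuilder::createRange (Ctx and LI are assumed; the [0, 1024) range is an example):
  MDBuilder MDB(Ctx);
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // [0, 1024)
  LI->setMetadata(LLVMContext::MD_range, Range);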
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
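A few illustrative MVT queries (the concrete types are examples only):
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  MVT Elt = V4I32.getScalarType();                 // MVT::i32
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  TypeSize Bytes = V4I32.getStoreSize();           // 16 bytes
  MVT I64 = MVT::getIntegerVT(64);                 // MVT::i64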
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
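Sketch of the block-splitting pattern that combines several of the MachineBasicBlock/MachineFunction entries above, as a custom inserter might do (MI and BB are assumed arguments; this is a generic pattern, not the body of any hook in this file):
  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), RemainderBB);
  RemainderBB->transferSuccessorsAndUpdatePHIs(BB);
  RemainderBB->splice(RemainderBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  BB->addSuccessor(RemainderBB);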
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
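Sketch of MachineInstrBuilder use via BuildMI (DL, TII, MBB, I and the registers are assumed; the AMDGPU opcodes are only examples):
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);                                  // DstReg = 0
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SumReg)
      .addReg(DstReg)
      .addReg(SrcReg, RegState::Kill);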
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
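Sketch of allocating a MachineMemOperand with the flags above (MF and PtrInfo are assumed; the 32-bit memory type and alignment are examples):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));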
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
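Typical SDNode/SDValue inspection, sketched with the accessors above (N is an assumed SDNode*; the guarded pattern is hypothetical):
  if (N->getOpcode() == ISD::ADD && N->hasOneUse()) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    EVT VT = LHS.getValueType();
    if (VT.isVector() || RHS.isUndef() || !LHS.getNode()->isDivergent()) {
      // a combine would rewrite the node here
    }
  }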
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:748
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:980
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:575
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:799
const Pass * getPass() const
Definition: SelectionDAG.h:491
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:501
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:854
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:825
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:495
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:710
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:496
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:698
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:490
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:871
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:508
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:584
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:578
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
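A hedged sketch of building nodes with the SelectionDAG helpers above (DAG, DL, VT, A and B are assumed; a real lowering would take the setcc result type from getSetCCResultType rather than hard-coding MVT::i1):
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Sum  = DAG.getNode(ISD::ADD, DL, VT, A, B);
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, Sum, Zero, ISD::SETNE);
  SDValue Sel  = DAG.getSelect(DL, VT, Cond, A, B);
  SDValue Wide = DAG.getZExtOrTrunc(Sel, DL, MVT::i64);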
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
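Sketch of the SmallVector operations listed above (Chain and ExtraOps are assumed):
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.append(ExtraOps.begin(), ExtraOps.end());
  if (!Ops.empty())
    Ops.resize(Ops.size() - 1); // drop the trailing element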
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
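Sketch of StringRef/StringSwitch use, e.g. when classifying a constraint string (parseRegKind is a hypothetical helper, not part of this file):
  static unsigned parseRegKind(StringRef S) {
    if (S.starts_with("{") && S.ends_with("}"))
      S = S.drop_front().drop_back();     // strip the braces
    return StringSwitch<unsigned>(S)
        .Case("sgpr", 0)
        .Case("vgpr", 1)
        .Default(~0u);
  }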
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
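These TargetLoweringBase hooks are normally called from a target's TargetLowering constructor; a generic sketch follows (the opcode/type/action choices below are illustrative, not the ones this file makes):
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  AddPromotedToType(ISD::LOAD, MVT::i1, MVT::i32);
  setTargetDAGCombine(ISD::FADD);
  setSchedulingPreference(Sched::RegPressure);
  setBooleanContents(ZeroOrOneBooleanContent);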
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
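Sketch of the Type predicates above (Ty is an assumed llvm::Type*):
  Type *EltTy = Ty->getScalarType();
  bool Is16BitFP = EltTy->isHalfTy() || EltTy->isBFloatTy();
  bool IsF32 = EltTy->isFloatTy();
  bool IsSizedInt = EltTy->isIntegerTy() && Ty->isSized();
  IntegerType *I32 = Type::getInt32Ty(Ty->getContext());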
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
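Sketch of the Value use/user APIs above (Old and New are assumed Values of the same type):
  if (!Old->use_empty()) {
    for (User *U : Old->users())
      (void)U;                      // inspect each user here if needed
    New->takeName(Old);
    Old->replaceAllUsesWith(New);
  }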
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable as an integer, i.e. not one of the inline values reserved for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
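A minimal sketch of the AMDGPU::isInlinableLiteral* helpers listed above, assuming the target-internal Utils/AMDGPUBaseInfo.h include path used inside the AMDGPU backend; the needsLiteral* helpers are hypothetical.
#include "Utils/AMDGPUBaseInfo.h" // target-internal header (assumed path)
// An operand needs an extra literal dword exactly when it is not inlinable.
static bool needsLiteral32(int32_t Imm, bool HasInv2Pi) {
  return !llvm::AMDGPU::isInlinableLiteral32(Imm, HasInv2Pi);
}
static bool needsLiteral64(int64_t Imm, bool HasInv2Pi) {
  return !llvm::AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
}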
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
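Illustrative only, not a helper from this file: a check against two of the AMDGPU calling conventions named above.
#include "llvm/IR/CallingConv.h"
static bool isAmdgpuKernelOrCS(llvm::CallingConv::ID CC) {
  return CC == llvm::CallingConv::AMDGPU_KERNEL ||
         CC == llvm::CallingConv::AMDGPU_CS;
}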
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware multiplication: like [US]ADDO, produces the product together with a boolean overflow result.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
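A minimal sketch (not a routine from this file) of how a few of the ISD opcodes above are combined when building DAG nodes; DAG, DL, X and Y are assumed to come from the surrounding lowering code.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Builds (add (shl X, 2), Y).
static SDValue buildShlAdd(SelectionDAG &DAG, const SDLoc &DL, SDValue X, SDValue Y) {
  EVT VT = X.getValueType();
  SDValue Amt = DAG.getConstant(2, DL, VT);            // shift amount node
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, Amt); // X << 2
  return DAG.getNode(ISD::ADD, DL, VT, Shl, Y);        // (X << 2) + Y
}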
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1639
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1586
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1600
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit upward, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count the number of 0 bits from the most significant bit downward, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
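A minimal sketch exercising several of the bit and math helpers listed above; the expected results are noted in comments.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;
static void mathExtrasExamples() {
  uint64_t V = 0xF000000000ULL;  // bits 36..39 set
  (void)isShiftedMask_64(V);     // true: one contiguous run of set bits
  (void)Hi_32(V);                // 0x000000F0
  (void)Lo_32(V);                // 0x00000000
  (void)popcount(V);             // 4
  (void)bit_width(33u);          // 6
  (void)PowerOf2Ceil(33);        // 64
  (void)isPowerOf2_32(64u);      // true
  (void)Log2_32(64);             // 6
  (void)divideCeil(10, 3);       // 4
}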
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is large enough to hold Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
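A minimal sketch, outside this file, of the range helpers referenced above (drop_begin, append_range, any_of, find_if, is_contained) operating on a SmallVector.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
static bool rangeHelperExamples() {
  SmallVector<int, 8> Vals = {1, 2, 3, 4};
  SmallVector<int, 8> More;
  append_range(More, drop_begin(Vals));                 // More = {2, 3, 4}
  bool HasEven = any_of(More, [](int V) { return V % 2 == 0; });
  auto It = find_if(More, [](int V) { return V > 2; }); // points at 3
  return HasEven && It != More.end() && is_contained(Vals, 4);
}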
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:297
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
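A minimal sketch of the alignment helpers above: Align wraps a power-of-two alignment, commonAlignment picks the alignment valid for both inputs, and alignTo rounds a size up to it.
#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;
static uint64_t alignExamples() {
  Align A(16);
  Align B = commonAlignment(A, /*Offset=*/8); // an offset of 8 only guarantees 8-byte alignment
  (void)B.value();                            // 8
  return alignTo(20, B);                      // 24
}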
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
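A minimal sketch composing the EVT helpers above: build a v4i32 type and derive an equivalently sized scalar integer type; Ctx is assumed to come from the caller.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;
static EVT evtExamples(LLVMContext &Ctx) {
  EVT EltVT = EVT::getIntegerVT(Ctx, 32);      // i32
  EVT VecVT = EVT::getVectorVT(Ctx, EltVT, 4); // v4i32
  assert(VecVT.isVector() && VecVT.getVectorNumElements() == 4);
  // An integer type with the same total width (128 bits here).
  return EVT::getIntegerVT(Ctx, VecVT.getSizeInBits().getFixedValue());
}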
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
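A minimal sketch of the KnownBits helpers listed above: combine two 32-bit operands whose upper halves are known zero and query the leading zeros of the sum.
#include "llvm/Support/KnownBits.h"
using namespace llvm;
static unsigned knownBitsExample() {
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(16);                 // upper half of LHS known zero
  RHS.Zero.setHighBits(16);                 // upper half of RHS known zero
  KnownBits Sum = KnownBits::add(LHS, RHS); // known bits of LHS + RHS
  return Sum.countMinLeadingZeros();        // at least 15
}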
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
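A minimal sketch of the MachinePointerInfo factories above; the MachineFunction and frame index FI are assumed to be supplied by the caller.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
static void pointerInfoExamples(MachineFunction &MF, int FI) {
  MachinePointerInfo StackPI = MachinePointerInfo::getStack(MF, /*Offset=*/16);
  MachinePointerInfo FixedPI = MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/0);
  (void)StackPI;
  (void)FixedPI;
}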
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals