SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
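// Illustrative sketch (not part of the original file): how argument-lowering
// code might use the helper above to reserve the next unused SGPR for a
// special input. The wrapper below is hypothetical; it only assumes a CCState
// that has already been populated for the formal arguments.
#if 0
static unsigned exampleReserveSpecialSGPR(CCState &CCInfo) {
  unsigned Reg = findFirstFreeSGPR(CCInfo);
  CCInfo.AllocateReg(Reg); // Mark it allocated so later queries skip it.
  return Reg;
}
#endif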
83
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, not all operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted-to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value;
452 // let LLVM add the comparison.
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // This is s_memtime on SI and s_memrealtime on VI.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
788 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
789 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 } else {
860 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
861 if (Subtarget->hasMinimum3Maximum3F32())
863
864 if (Subtarget->hasMinimum3Maximum3PKF16())
866 }
867
869 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
870 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
871 MVT::i8},
872 Custom);
873
875 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
876 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
877 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
878 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
882 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
883 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
884 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
885 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
886 Custom);
887
893
894 // TODO: Could move this to custom lowering, could benefit from combines on
895 // extract of relevant bits.
897
899
900 if (Subtarget->hasBF16ConversionInsts()) {
904 }
905
906 if (Subtarget->hasCvtPkF16F32Inst()) {
908 }
909
912 ISD::SUB,
914 ISD::MUL,
915 ISD::FADD,
916 ISD::FSUB,
917 ISD::FDIV,
918 ISD::FMUL,
925 ISD::FMA,
926 ISD::SMIN,
927 ISD::SMAX,
928 ISD::UMIN,
929 ISD::UMAX,
932 ISD::SMIN,
933 ISD::SMAX,
934 ISD::UMIN,
935 ISD::UMAX,
936 ISD::AND,
937 ISD::OR,
938 ISD::XOR,
939 ISD::SHL,
940 ISD::SRL,
941 ISD::SRA,
942 ISD::FSHR,
952
953 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955
956 // All memory operations. Some folding on the pointer operand is done to
957 // help match the constant offsets in the addressing modes.
982
983 // FIXME: In other contexts we pretend this is a per-function property.
985
987}
988
989const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
990
992 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
993 return RCRegs;
994}
995
996//===----------------------------------------------------------------------===//
997// TargetLowering queries
998//===----------------------------------------------------------------------===//
999
1000// v_mad_mix* support a conversion from f16 to f32.
1001//
1002 // There is only one special case, when denormals are enabled, where this
1003 // would still be OK to use, but we don't currently handle it.
1004bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1005 EVT DestVT, EVT SrcVT) const {
1006 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1007 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1008 DestVT.getScalarType() == MVT::f32 &&
1009 SrcVT.getScalarType() == MVT::f16 &&
1010 // TODO: This probably only requires no input flushing?
1012}
1013
1015 LLT DestTy, LLT SrcTy) const {
1016 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1017 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1018 DestTy.getScalarSizeInBits() == 32 &&
1019 SrcTy.getScalarSizeInBits() == 16 &&
1020 // TODO: This probably only requires no input flushing?
1021 denormalModeIsFlushAllF32(*MI.getMF());
1022}
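// Illustrative sketch (not part of the original file): the combiner uses these
// hooks to decide whether an (fpext f16) source can be folded directly into an
// f32 FMA, which is what permits forming the mixed-precision mad/fma_mix
// instructions. The helper below is hypothetical.
#if 0
static bool exampleCanFoldF16ExtIntoFMA(const SITargetLowering &TLI,
                                        const SelectionDAG &DAG) {
  return TLI.isFPExtFoldable(DAG, ISD::FMA, MVT::f32, MVT::f16);
}
#endif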
1023
1025 // SI has some legal vector types, but no legal vector operations. Say no
1026 // shuffles are legal in order to prefer scalarizing some vector operations.
1027 return false;
1028}
1029
1032 EVT VT) const {
1035
1036 if (VT.isVector()) {
1037 EVT ScalarVT = VT.getScalarType();
1038 unsigned Size = ScalarVT.getSizeInBits();
1039 if (Size == 16) {
1040 if (Subtarget->has16BitInsts()) {
1041 if (VT.isInteger())
1042 return MVT::v2i16;
1043 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1044 }
1045 return VT.isInteger() ? MVT::i32 : MVT::f32;
1046 }
1047
1048 if (Size < 16)
1049 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1050 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1051 }
1052
1053 if (VT.getSizeInBits() > 32)
1054 return MVT::i32;
1055
1057}
1058
1061 EVT VT) const {
1064
1065 if (VT.isVector()) {
1066 unsigned NumElts = VT.getVectorNumElements();
1067 EVT ScalarVT = VT.getScalarType();
1068 unsigned Size = ScalarVT.getSizeInBits();
1069
1070 // FIXME: Should probably promote 8-bit vectors to i16.
1071 if (Size == 16 && Subtarget->has16BitInsts())
1072 return (NumElts + 1) / 2;
1073
1074 if (Size <= 32)
1075 return NumElts;
1076
1077 if (Size > 32)
1078 return NumElts * ((Size + 31) / 32);
1079 } else if (VT.getSizeInBits() > 32)
1080 return (VT.getSizeInBits() + 31) / 32;
1081
1083}
1084
1086 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1087 unsigned &NumIntermediates, MVT &RegisterVT) const {
1088 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1089 unsigned NumElts = VT.getVectorNumElements();
1090 EVT ScalarVT = VT.getScalarType();
1091 unsigned Size = ScalarVT.getSizeInBits();
1092 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1093 // support, but unless we can properly handle 3-vectors, it will still be
1094 // inconsistent.
1095 if (Size == 16 && Subtarget->has16BitInsts()) {
1096 if (ScalarVT == MVT::bf16) {
1097 RegisterVT = MVT::i32;
1098 IntermediateVT = MVT::v2bf16;
1099 } else {
1100 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1101 IntermediateVT = RegisterVT;
1102 }
1103 NumIntermediates = (NumElts + 1) / 2;
1104 return NumIntermediates;
1105 }
1106
1107 if (Size == 32) {
1108 RegisterVT = ScalarVT.getSimpleVT();
1109 IntermediateVT = RegisterVT;
1110 NumIntermediates = NumElts;
1111 return NumIntermediates;
1112 }
1113
1114 if (Size < 16 && Subtarget->has16BitInsts()) {
1115 // FIXME: Should probably form v2i16 pieces
1116 RegisterVT = MVT::i16;
1117 IntermediateVT = ScalarVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size != 16 && Size <= 32) {
1123 RegisterVT = MVT::i32;
1124 IntermediateVT = ScalarVT;
1125 NumIntermediates = NumElts;
1126 return NumIntermediates;
1127 }
1128
1129 if (Size > 32) {
1130 RegisterVT = MVT::i32;
1131 IntermediateVT = RegisterVT;
1132 NumIntermediates = NumElts * ((Size + 31) / 32);
1133 return NumIntermediates;
1134 }
1135 }
1136
1138 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1139}
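// Illustrative example (not part of the original file): with 16-bit
// instructions available and a non-kernel calling convention, a v4f16
// argument is split into (NumElts + 1) / 2 = 2 pieces of v2f16, matching the
// logic above. The helper below is hypothetical.
#if 0
static unsigned exampleV4F16Breakdown(const SITargetLowering &TLI,
                                      LLVMContext &Ctx) {
  EVT IntermediateVT;
  MVT RegisterVT;
  unsigned NumIntermediates = 0;
  unsigned NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
      Ctx, CallingConv::AMDGPU_PS, MVT::v4f16, IntermediateVT,
      NumIntermediates, RegisterVT);
  // Expect RegisterVT == MVT::v2f16 and NumRegs == NumIntermediates == 2.
  return NumRegs;
}
#endif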
1140
1142 const DataLayout &DL, Type *Ty,
1143 unsigned MaxNumLanes) {
1144 assert(MaxNumLanes != 0);
1145
1146 LLVMContext &Ctx = Ty->getContext();
1147 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1148 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1149 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1150 NumElts);
1151 }
1152
1153 return TLI.getValueType(DL, Ty);
1154}
1155
1156// Peek through TFE struct returns to only use the data size.
1158 const DataLayout &DL, Type *Ty,
1159 unsigned MaxNumLanes) {
1160 auto *ST = dyn_cast<StructType>(Ty);
1161 if (!ST)
1162 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1163
1164 // TFE intrinsics return an aggregate type.
1165 assert(ST->getNumContainedTypes() == 2 &&
1166 ST->getContainedType(1)->isIntegerTy(32));
1167 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1168}
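// Illustrative sketch (not part of the original file): a TFE image load
// returns an aggregate such as { <4 x float>, i32 }, and only the data member
// determines the memory VT, clamped by the dmask lane count. The helper below
// is hypothetical.
#if 0
static EVT exampleTFEMemVT(const SITargetLowering &TLI, const DataLayout &DL,
                           LLVMContext &Ctx) {
  auto *DataTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  auto *TFETy = StructType::get(Ctx, {DataTy, Type::getInt32Ty(Ctx)});
  // With a dmask selecting two lanes this yields MVT::v2f32.
  return memVTFromLoadIntrReturn(TLI, DL, TFETy, /*MaxNumLanes=*/2);
}
#endif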
1169
1170/// Map address space 7 to MVT::v5i32 because that's its in-memory
1171/// representation. This return value is vector-typed because there is no
1172/// MVT::i160 and it is not clear if one can be added. While this could
1173/// cause issues during codegen, these address space 7 pointers will be
1174/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1175/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1176/// modeling, to work.
1178 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1179 return MVT::v5i32;
1181 DL.getPointerSizeInBits(AS) == 192)
1182 return MVT::v6i32;
1184}
1185/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1186/// v8i32 when padding is added.
1187/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1188/// also v8i32 with padding.
1190 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1191 DL.getPointerSizeInBits(AS) == 160) ||
1193 DL.getPointerSizeInBits(AS) == 192))
1194 return MVT::v8i32;
1196}
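// Illustrative sketch (not part of the original file): the two views of a
// buffer fat pointer (address space 7), assuming the usual amdgcn data layout
// where it is 160 bits wide. The value type is v5i32, while the in-memory
// type is padded out to v8i32. The helper below is hypothetical.
#if 0
static void examplePointerTys(const SITargetLowering &TLI,
                              const DataLayout &DL) {
  MVT ValueTy = TLI.getPointerTy(DL, AMDGPUAS::BUFFER_FAT_POINTER);
  MVT MemTy = TLI.getPointerMemTy(DL, AMDGPUAS::BUFFER_FAT_POINTER);
  // Expect ValueTy == MVT::v5i32 and MemTy == MVT::v8i32.
  (void)ValueTy;
  (void)MemTy;
}
#endif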
1197
1199 const CallInst &CI,
1200 MachineFunction &MF,
1201 unsigned IntrID) const {
1203 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1205
1206 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1208 AttributeList Attr =
1210 MemoryEffects ME = Attr.getMemoryEffects();
1211 if (ME.doesNotAccessMemory())
1212 return false;
1213
1214 // TODO: Should images get their own address space?
1215 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1216
1217 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1218 if (RsrcIntr->IsImage) {
1221 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1222 Info.align.reset();
1223 }
1224
1225 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1226 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1227 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1228 // We conservatively set the memory operand of a buffer intrinsic to the
1229 // base resource pointer, so that we can access alias information about
1230 // those pointers. Cases like "this points at the same value
1231 // but with a different offset" are handled in
1232 // areMemAccessesTriviallyDisjoint.
1233 Info.ptrVal = RsrcArg;
1234 }
1235
1236 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1237 if (!IsSPrefetch) {
1238 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1239 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1241 }
1242
1244 if (ME.onlyReadsMemory()) {
1245 if (RsrcIntr->IsImage) {
1246 unsigned MaxNumLanes = 4;
1247
1248 if (!BaseOpcode->Gather4) {
1249 // If this isn't a gather, we may have excess loaded elements in the
1250 // IR type. Check the dmask for the real number of elements loaded.
1251 unsigned DMask =
1252 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1253 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1254 }
1255
1256 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1257 CI.getType(), MaxNumLanes);
1258 } else {
1259 Info.memVT =
1261 std::numeric_limits<unsigned>::max());
1262 }
1263
1264 // FIXME: What does alignment mean for an image?
1267 } else if (ME.onlyWritesMemory()) {
1269
1270 Type *DataTy = CI.getArgOperand(0)->getType();
1271 if (RsrcIntr->IsImage) {
1272 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1273 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1274 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1275 DMaskLanes);
1276 } else
1277 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1278
1280 } else {
1281 // Atomic, NoReturn Sampler or prefetch
1284 Info.flags |=
1286
1287 if (!IsSPrefetch)
1289
1290 switch (IntrID) {
1291 default:
1292 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1293 // Fake memory access type for no return sampler intrinsics
1294 Info.memVT = MVT::i32;
1295 } else {
1296 // XXX - Should this be volatile without known ordering?
1298 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1299 }
1300 break;
1301 case Intrinsic::amdgcn_raw_buffer_load_lds:
1302 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1303 case Intrinsic::amdgcn_struct_buffer_load_lds:
1304 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1305 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1306 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1307 Info.ptrVal = CI.getArgOperand(1);
1308 return true;
1309 }
1310 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1311 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1312 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1313 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1314 Info.memVT =
1316 std::numeric_limits<unsigned>::max());
1317 Info.flags &= ~MachineMemOperand::MOStore;
1318 return true;
1319 }
1320 }
1321 }
1322 return true;
1323 }
1324
1325 switch (IntrID) {
1326 case Intrinsic::amdgcn_ds_ordered_add:
1327 case Intrinsic::amdgcn_ds_ordered_swap: {
1329 Info.memVT = MVT::getVT(CI.getType());
1330 Info.ptrVal = CI.getOperand(0);
1331 Info.align.reset();
1333
1334 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1335 if (!Vol->isZero())
1337
1338 return true;
1339 }
1340 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1341 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1343 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1344 Info.ptrVal = nullptr;
1345 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1347 return true;
1348 }
1349 case Intrinsic::amdgcn_ds_append:
1350 case Intrinsic::amdgcn_ds_consume: {
1352 Info.memVT = MVT::getVT(CI.getType());
1353 Info.ptrVal = CI.getOperand(0);
1354 Info.align.reset();
1356
1357 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1358 if (!Vol->isZero())
1360
1361 return true;
1362 }
1363 case Intrinsic::amdgcn_global_atomic_csub: {
1365 Info.memVT = MVT::getVT(CI.getType());
1366 Info.ptrVal = CI.getOperand(0);
1367 Info.align.reset();
1370 return true;
1371 }
1372 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1374 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1375
1376 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1377 Info.align.reset();
1378 Info.flags |=
1380 return true;
1381 }
1382 case Intrinsic::amdgcn_global_atomic_fmin_num:
1383 case Intrinsic::amdgcn_global_atomic_fmax_num:
1384 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1385 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1386 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1387 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1389 Info.memVT = MVT::getVT(CI.getType());
1390 Info.ptrVal = CI.getOperand(0);
1391 Info.align.reset();
1395 return true;
1396 }
1397 case Intrinsic::amdgcn_global_load_tr_b64:
1398 case Intrinsic::amdgcn_global_load_tr_b128:
1399 case Intrinsic::amdgcn_ds_read_tr4_b64:
1400 case Intrinsic::amdgcn_ds_read_tr6_b96:
1401 case Intrinsic::amdgcn_ds_read_tr8_b64:
1402 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1404 Info.memVT = MVT::getVT(CI.getType());
1405 Info.ptrVal = CI.getOperand(0);
1406 Info.align.reset();
1408 return true;
1409 }
1410 case Intrinsic::amdgcn_ds_gws_init:
1411 case Intrinsic::amdgcn_ds_gws_barrier:
1412 case Intrinsic::amdgcn_ds_gws_sema_v:
1413 case Intrinsic::amdgcn_ds_gws_sema_br:
1414 case Intrinsic::amdgcn_ds_gws_sema_p:
1415 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1417
1418 const GCNTargetMachine &TM =
1419 static_cast<const GCNTargetMachine &>(getTargetMachine());
1420
1422 Info.ptrVal = MFI->getGWSPSV(TM);
1423
1424 // This is an abstract access, but we need to specify a type and size.
1425 Info.memVT = MVT::i32;
1426 Info.size = 4;
1427 Info.align = Align(4);
1428
1429 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1431 else
1433 return true;
1434 }
1435 case Intrinsic::amdgcn_global_load_lds: {
1437 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1438 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1439 Info.ptrVal = CI.getArgOperand(1);
1441 return true;
1442 }
1443 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1445
1446 const GCNTargetMachine &TM =
1447 static_cast<const GCNTargetMachine &>(getTargetMachine());
1448
1450 Info.ptrVal = MFI->getGWSPSV(TM);
1451
1452 // This is an abstract access, but we need to specify a type and size.
1453 Info.memVT = MVT::i32;
1454 Info.size = 4;
1455 Info.align = Align(4);
1456
1458 return true;
1459 }
1460 case Intrinsic::amdgcn_s_prefetch_data: {
1462 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1463 Info.ptrVal = CI.getArgOperand(0);
1465 return true;
1466 }
1467 default:
1468 return false;
1469 }
1470}
1471
1473 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1474 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1475 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1476 // The DAG's ValueType loses the addrspaces.
1477 // Add them as 2 extra Constant operands "from" and "to".
1478 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1479 unsigned DstAS = I.getType()->getPointerAddressSpace();
1480 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1481 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1482 break;
1483 }
1484 default:
1485 break;
1486 }
1487}
1488
1491 Type *&AccessTy) const {
1492 Value *Ptr = nullptr;
1493 switch (II->getIntrinsicID()) {
1494 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1495 case Intrinsic::amdgcn_ds_append:
1496 case Intrinsic::amdgcn_ds_consume:
1497 case Intrinsic::amdgcn_ds_read_tr4_b64:
1498 case Intrinsic::amdgcn_ds_read_tr6_b96:
1499 case Intrinsic::amdgcn_ds_read_tr8_b64:
1500 case Intrinsic::amdgcn_ds_read_tr16_b64:
1501 case Intrinsic::amdgcn_ds_ordered_add:
1502 case Intrinsic::amdgcn_ds_ordered_swap:
1503 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1504 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1505 case Intrinsic::amdgcn_global_atomic_csub:
1506 case Intrinsic::amdgcn_global_atomic_fmax_num:
1507 case Intrinsic::amdgcn_global_atomic_fmin_num:
1508 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1509 case Intrinsic::amdgcn_global_load_tr_b64:
1510 case Intrinsic::amdgcn_global_load_tr_b128:
1511 Ptr = II->getArgOperand(0);
1512 break;
1513 case Intrinsic::amdgcn_global_load_lds:
1514 Ptr = II->getArgOperand(1);
1515 break;
1516 default:
1517 return false;
1518 }
1519 AccessTy = II->getType();
1520 Ops.push_back(Ptr);
1521 return true;
1522}
1523
1525 unsigned AddrSpace) const {
1526 if (!Subtarget->hasFlatInstOffsets()) {
1527 // Flat instructions do not have offsets, and only have the register
1528 // address.
1529 return AM.BaseOffs == 0 && AM.Scale == 0;
1530 }
1531
1532 decltype(SIInstrFlags::FLAT) FlatVariant =
1536
1537 return AM.Scale == 0 &&
1538 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1539 AM.BaseOffs, AddrSpace, FlatVariant));
1540}
1541
1543 if (Subtarget->hasFlatGlobalInsts())
1545
1546 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1547 // Assume that we will use FLAT for all global memory accesses
1548 // on VI.
1549 // FIXME: This assumption is currently wrong. On VI we still use
1550 // MUBUF instructions for the r + i addressing mode. As currently
1551 // implemented, the MUBUF instructions only work on buffers < 4GB.
1552 // It may be possible to support > 4GB buffers with MUBUF instructions,
1553 // by setting the stride value in the resource descriptor which would
1554 // increase the size limit to (stride * 4GB). However, this is risky,
1555 // because it has never been validated.
1557 }
1558
1559 return isLegalMUBUFAddressingMode(AM);
1560}
1561
1562bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1563 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1564 // additionally can do r + r + i with addr64. 32-bit has more addressing
1565 // mode options. Depending on the resource constant, it can also do
1566 // (i64 r0) + (i32 r1) * (i14 i).
1567 //
1568 // Private arrays end up using a scratch buffer most of the time, so also
1569 // assume those use MUBUF instructions. Scratch loads / stores are currently
1570 // implemented as mubuf instructions with offen bit set, so slightly
1571 // different than the normal addr64.
1572 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1573 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1574 return false;
1575
1576 // FIXME: Since we can split immediate into soffset and immediate offset,
1577 // would it make sense to allow any immediate?
1578
1579 switch (AM.Scale) {
1580 case 0: // r + i or just i, depending on HasBaseReg.
1581 return true;
1582 case 1:
1583 return true; // We have r + r or r + i.
1584 case 2:
1585 if (AM.HasBaseReg) {
1586 // Reject 2 * r + r.
1587 return false;
1588 }
1589
1590 // Allow 2 * r as r + r
1591 // Or 2 * r + i is allowed as r + r + i.
1592 return true;
1593 default: // Don't allow n * r
1594 return false;
1595 }
1596}
1597
1599 const AddrMode &AM, Type *Ty,
1600 unsigned AS,
1601 Instruction *I) const {
1602 // No global is ever allowed as a base.
1603 if (AM.BaseGV)
1604 return false;
1605
1606 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1607 return isLegalGlobalAddressingMode(AM);
1608
1609 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1613 // If the offset isn't a multiple of 4, it probably isn't going to be
1614 // correctly aligned.
1615 // FIXME: Can we get the real alignment here?
1616 if (AM.BaseOffs % 4 != 0)
1617 return isLegalMUBUFAddressingMode(AM);
1618
1619 if (!Subtarget->hasScalarSubwordLoads()) {
1620 // There are no SMRD extloads, so if we have to do a small type access we
1621 // will use a MUBUF load.
1622 // FIXME?: We also need to do this if unaligned, but we don't know the
1623 // alignment here.
1624 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1625 return isLegalGlobalAddressingMode(AM);
1626 }
1627
1629 // SMRD instructions have an 8-bit, dword offset on SI.
1630 if (!isUInt<8>(AM.BaseOffs / 4))
1631 return false;
1632 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1633 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1634 // in 8-bits, it can use a smaller encoding.
1635 if (!isUInt<32>(AM.BaseOffs / 4))
1636 return false;
1637 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1638 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1639 if (!isUInt<20>(AM.BaseOffs))
1640 return false;
1641 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1642 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1643 // for S_BUFFER_* instructions).
1644 if (!isInt<21>(AM.BaseOffs))
1645 return false;
1646 } else {
1647 // On GFX12, all offsets are signed 24-bit in bytes.
1648 if (!isInt<24>(AM.BaseOffs))
1649 return false;
1650 }
1651
1652 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1654 AM.BaseOffs < 0) {
1655 // Scalar (non-buffer) loads can only use a negative offset if
1656 // soffset+offset is non-negative. Since the compiler can only prove that
1657 // in a few special cases, it is safer to claim that negative offsets are
1658 // not supported.
1659 return false;
1660 }
1661
1662 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1663 return true;
1664
1665 if (AM.Scale == 1 && AM.HasBaseReg)
1666 return true;
1667
1668 return false;
1669 }
1670
1671 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1672 return Subtarget->enableFlatScratch()
1674 : isLegalMUBUFAddressingMode(AM);
1675
1676 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1677 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1678 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1679 // field.
1680 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1681 // an 8-bit dword offset but we don't know the alignment here.
1682 if (!isUInt<16>(AM.BaseOffs))
1683 return false;
1684
1685 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1686 return true;
1687
1688 if (AM.Scale == 1 && AM.HasBaseReg)
1689 return true;
1690
1691 return false;
1692 }
1693
1695 // For an unknown address space, this usually means that this is for some
1696 // reason being used for pure arithmetic, and not based on some addressing
1697 // computation. We don't have instructions that compute pointers with any
1698 // addressing modes, so treat them as having no offset like flat
1699 // instructions.
1701 }
1702
1703 // Assume a user alias of global for unknown address spaces.
1704 return isLegalGlobalAddressingMode(AM);
1705}
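// Illustrative sketch (not part of the original file): querying whether a
// "base register + 16" access to global memory is a legal addressing mode.
// The AddrMode fields follow TargetLoweringBase::AddrMode; the helper below
// is hypothetical.
#if 0
static bool exampleIsLegalGlobalBasePlus16(const SITargetLowering &TLI,
                                           const DataLayout &DL,
                                           LLVMContext &Ctx) {
  TargetLoweringBase::AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = 16; // register base plus a small immediate offset
  AM.Scale = 0;
  return TLI.isLegalAddressingMode(DL, AM, Type::getInt32Ty(Ctx),
                                   AMDGPUAS::GLOBAL_ADDRESS);
}
#endif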
1706
1708 const MachineFunction &MF) const {
1710 return (MemVT.getSizeInBits() <= 4 * 32);
1711 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1712 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1713 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1714 }
1716 return (MemVT.getSizeInBits() <= 2 * 32);
1717 return true;
1718}
1719
1721 unsigned Size, unsigned AddrSpace, Align Alignment,
1722 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1723 if (IsFast)
1724 *IsFast = 0;
1725
1726 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1727 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1728 // Check if alignment requirements for ds_read/write instructions are
1729 // disabled.
1730 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1731 return false;
1732
1733 Align RequiredAlignment(
1734 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1735 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1736 Alignment < RequiredAlignment)
1737 return false;
1738
1739 // Either the alignment requirements are "enabled", or there is an
1740 // unaligned-LDS-access-related hardware bug even though alignment requirements
1741 // are "disabled". In either case, we need to check for proper alignment
1742 // requirements.
1743 //
1744 switch (Size) {
1745 case 64:
1746 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1747 // address is negative, then the instruction is incorrectly treated as
1748 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1749 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1750 // load later in the SILoadStoreOptimizer.
1751 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1752 return false;
1753
1754 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1755 // can do a 4-byte aligned, 8-byte access in a single operation using
1756 // ds_read2/write2_b32 with adjacent offsets.
1757 RequiredAlignment = Align(4);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1761 // ds_write2_b32 depending on the alignment. In either case with either
1762 // alignment there is no faster way of doing this.
1763
1764 // The numbers returned here and below are not additive, it is a 'speed
1765 // rank'. They are just meant to be compared to decide if a certain way
1766 // of lowering an operation is faster than another. For that purpose
1767 // a naturally aligned operation gets its bitsize to indicate that "it
1768 // operates with a speed comparable to N-bit wide load". With the full
1769 // alignment ds128 is slower than ds96 for example. If underaligned it
1770 // is comparable to the speed of a single dword access, which would then
1771 // mean 32 < 128 and it is faster to issue a wide load regardless.
1772 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1773 // wider load that will no longer be aligned, the latter is slower.
1774 if (IsFast)
1775 *IsFast = (Alignment >= RequiredAlignment) ? 64
1776 : (Alignment < Align(4)) ? 32
1777 : 1;
1778 return true;
1779 }
1780
1781 break;
1782 case 96:
1783 if (!Subtarget->hasDS96AndDS128())
1784 return false;
1785
1786 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1787 // gfx8 and older.
1788
1789 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1790 // Naturally aligned access is fastest. However, also report it is Fast
1791 // if memory is aligned less than DWORD. A narrow load or store will be
1792 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1793 // be more of them, so overall we will pay less penalty issuing a single
1794 // instruction.
1795
1796 // See comment on the values above.
1797 if (IsFast)
1798 *IsFast = (Alignment >= RequiredAlignment) ? 96
1799 : (Alignment < Align(4)) ? 32
1800 : 1;
1801 return true;
1802 }
1803
1804 break;
1805 case 128:
1806 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1807 return false;
1808
1809 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1810 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1811 // single operation using ds_read2/write2_b64.
1812 RequiredAlignment = Align(8);
1813
1814 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1815 // Naturally aligned access is fastest. However, also report it is Fast
1816 // if memory is aligned less than DWORD. A narrow load or store will be
1817 // equally slow as a single ds_read_b128/ds_write_b128, but there
1818 // will be more of them, so overall we will pay less penalty issuing a
1819 // single instruction.
1820
1821 // See comment on the values above.
1822 if (IsFast)
1823 *IsFast = (Alignment >= RequiredAlignment) ? 128
1824 : (Alignment < Align(4)) ? 32
1825 : 1;
1826 return true;
1827 }
1828
1829 break;
1830 default:
1831 if (Size > 32)
1832 return false;
1833
1834 break;
1835 }
1836
1837 // See comment on the values above.
1838 // Note that we have a single-dword or sub-dword here, so if underaligned
1839 // it is the slowest possible access, hence the returned value is 0.
1840 if (IsFast)
1841 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1842
1843 return Alignment >= RequiredAlignment ||
1844 Subtarget->hasUnalignedDSAccessEnabled();
1845 }
1846
1847 // FIXME: We have to be conservative here and assume that flat operations
1848 // will access scratch. If we had access to the IR function, then we
1849 // could determine if any private memory was used in the function.
1850 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1851 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1852 bool AlignedBy4 = Alignment >= Align(4);
1853 if (IsFast)
1854 *IsFast = AlignedBy4;
1855
1856 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1857 }
1858
1859 // So long as they are correct, wide global memory operations perform better
1860 // than multiple smaller memory ops -- even when misaligned
1861 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1862 if (IsFast)
1863 *IsFast = Size;
1864
1865 return Alignment >= Align(4) ||
1867 }
1868
1869 // Values smaller than a dword must be aligned.
1870 if (Size < 32)
1871 return false;
1872
1873 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1874 // byte-address are ignored, thus forcing Dword alignment.
1875 // This applies to private, global, and constant memory.
1876 if (IsFast)
1877 *IsFast = 1;
1878
1879 return Size >= 32 && Alignment >= Align(4);
1880}
1881
1883 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1884 unsigned *IsFast) const {
1886 Alignment, Flags, IsFast);
1887}
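// Illustrative sketch (not part of the original file): the IsFast out-value
// is a relative "speed rank" rather than a boolean, so two candidate
// lowerings are compared by rank. The helper below is hypothetical.
#if 0
static bool examplePreferWideDSAccess(const SITargetLowering &TLI) {
  unsigned FastWide = 0, FastNarrow = 0;
  TLI.allowsMisalignedMemoryAccesses(MVT::v4i32, AMDGPUAS::LOCAL_ADDRESS,
                                     Align(4), MachineMemOperand::MONone,
                                     &FastWide);
  TLI.allowsMisalignedMemoryAccesses(MVT::i32, AMDGPUAS::LOCAL_ADDRESS,
                                     Align(4), MachineMemOperand::MONone,
                                     &FastNarrow);
  // A larger rank means the wider access is preferred even if under-aligned;
  // a rank of 1 means "slow, don't do it".
  return FastWide > FastNarrow;
}
#endif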
1888
1890 const MemOp &Op, const AttributeList &FuncAttributes) const {
1891 // FIXME: Should account for address space here.
1892
1893 // The default fallback uses the private pointer size as a guess for a type to
1894 // use. Make sure we switch these to 64-bit accesses.
1895
1896 if (Op.size() >= 16 &&
1897 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1898 return MVT::v4i32;
1899
1900 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1901 return MVT::v2i32;
1902
1903 // Use the default.
1904 return MVT::Other;
1905}
1906
1908 const MemSDNode *MemNode = cast<MemSDNode>(N);
1909 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1910}
1911
1913 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1915}
1916
1918 unsigned DestAS) const {
1919 // Flat -> private/local is a simple truncate.
1920 // Flat -> global is a no-op.
1921 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1922 return true;
1923
1924 const GCNTargetMachine &TM =
1925 static_cast<const GCNTargetMachine &>(getTargetMachine());
1926 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1927}
1928
1931 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1932 VT.getScalarType().bitsLE(MVT::i16))
1935}
1936
1938 Type *Ty) const {
1939 // FIXME: Could be smarter if called for vector constants.
1940 return true;
1941}
1942
1944 unsigned Index) const {
1946 return false;
1947
1948 // TODO: Add more cases that are cheap.
1949 return Index == 0;
1950}
1951
1953 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1954 switch (Op) {
1955 case ISD::LOAD:
1956 case ISD::STORE:
1957 return true;
1958 default:
1959 return false;
1960 }
1961 }
1962
1963 // SimplifySetCC uses this function to determine whether or not it should
1964 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1965 if (VT == MVT::i1 && Op == ISD::SETCC)
1966 return false;
1967
1969}
1970
1971SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1972 const SDLoc &SL,
1973 SDValue Chain,
1974 uint64_t Offset) const {
1975 const DataLayout &DL = DAG.getDataLayout();
1979
1980 auto [InputPtrReg, RC, ArgTy] =
1982
1983 // We may not have the kernarg segment argument if we have no kernel
1984 // arguments.
1985 if (!InputPtrReg)
1986 return DAG.getConstant(Offset, SL, PtrVT);
1987
1989 SDValue BasePtr = DAG.getCopyFromReg(
1990 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1991
1992 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1993}
1994
1995SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1996 const SDLoc &SL) const {
1999 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2000}
2001
2002SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2003 const SDLoc &SL) const {
2004
2006 std::optional<uint32_t> KnownSize =
2008 if (KnownSize.has_value())
2009 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2010 return SDValue();
2011}
2012
2013SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2014 const SDLoc &SL, SDValue Val,
2015 bool Signed,
2016 const ISD::InputArg *Arg) const {
2017 // First, if it is a widened vector, narrow it.
2018 if (VT.isVector() &&
2020 EVT NarrowedVT =
2023 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2024 DAG.getConstant(0, SL, MVT::i32));
2025 }
2026
2027 // Then convert the vector elements or scalar value.
2028 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2029 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2030 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2031 }
2032
2033 if (MemVT.isFloatingPoint())
2034 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2035 else if (Signed)
2036 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2037 else
2038 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2039
2040 return Val;
2041}
2042
2043SDValue SITargetLowering::lowerKernargMemParameter(
2044 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2045 uint64_t Offset, Align Alignment, bool Signed,
2046 const ISD::InputArg *Arg) const {
2047 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2048
2049 // Try to avoid using an extload by loading earlier than the argument address,
2050 // and extracting the relevant bits. The load should hopefully be merged with
2051 // the previous argument.
2052 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2053 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2054 int64_t AlignDownOffset = alignDown(Offset, 4);
2055 int64_t OffsetDiff = Offset - AlignDownOffset;
2056
2057 EVT IntVT = MemVT.changeTypeToInteger();
2058
2059 // TODO: If we passed in the base kernel offset we could have a better
2060 // alignment than 4, but we don't really need it.
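// Load the whole dword containing the argument, then shift right by the
// byte offset and truncate to recover the value.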
2061 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2062 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2063 MachineMemOperand::MODereferenceable |
2064 MachineMemOperand::MOInvariant);
2065
2066 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2067 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2068
2069 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2070 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2071 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2072
2073 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2074 }
2075
2076 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2077 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2078 MachineMemOperand::MODereferenceable |
2079 MachineMemOperand::MOInvariant);
2080
2081 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2082 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2083}
2084
2085SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2086 CCValAssign &VA, const SDLoc &SL,
2087 SDValue Chain,
2088 const ISD::InputArg &Arg) const {
2089 MachineFunction &MF = DAG.getMachineFunction();
2090 MachineFrameInfo &MFI = MF.getFrameInfo();
2091
2092 if (Arg.Flags.isByVal()) {
2093 unsigned Size = Arg.Flags.getByValSize();
2094 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2095 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2096 }
2097
2098 unsigned ArgOffset = VA.getLocMemOffset();
2099 unsigned ArgSize = VA.getValVT().getStoreSize();
2100
2101 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2102
2103 // Create load nodes to retrieve arguments from the stack.
2104 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2105 SDValue ArgValue;
2106
2107 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2108 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2109 MVT MemVT = VA.getValVT();
2110
2111 switch (VA.getLocInfo()) {
2112 default:
2113 break;
2114 case CCValAssign::BCvt:
2115 MemVT = VA.getLocVT();
2116 break;
2117 case CCValAssign::SExt:
2118 ExtType = ISD::SEXTLOAD;
2119 break;
2120 case CCValAssign::ZExt:
2121 ExtType = ISD::ZEXTLOAD;
2122 break;
2123 case CCValAssign::AExt:
2124 ExtType = ISD::EXTLOAD;
2125 break;
2126 }
2127
2128 ArgValue = DAG.getExtLoad(
2129 ExtType, SL, VA.getLocVT(), Chain, FIN,
2130 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2131 return ArgValue;
2132}
2133
2134SDValue SITargetLowering::getPreloadedValue(
2135 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2137 const ArgDescriptor *Reg = nullptr;
2138 const TargetRegisterClass *RC;
2139 LLT Ty;
2140
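// With architected SGPRs the workgroup IDs are read from TTMP registers:
// X from TTMP9, and Y/Z packed into the low and high halves of TTMP7.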
2142 const ArgDescriptor WorkGroupIDX =
2143 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2144 // If GridZ is not programmed in an entry function then the hardware will set
2145 // it to all zeros, so there is no need to mask the GridY value in the low
2146 // order bits.
2147 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2148 AMDGPU::TTMP7,
2149 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2150 const ArgDescriptor WorkGroupIDZ =
2151 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2152 if (Subtarget->hasArchitectedSGPRs() &&
2154 switch (PVID) {
2156 Reg = &WorkGroupIDX;
2157 RC = &AMDGPU::SReg_32RegClass;
2158 Ty = LLT::scalar(32);
2159 break;
2161 Reg = &WorkGroupIDY;
2162 RC = &AMDGPU::SReg_32RegClass;
2163 Ty = LLT::scalar(32);
2164 break;
2166 Reg = &WorkGroupIDZ;
2167 RC = &AMDGPU::SReg_32RegClass;
2168 Ty = LLT::scalar(32);
2169 break;
2170 default:
2171 break;
2172 }
2173 }
2174
2175 if (!Reg)
2176 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2177 if (!Reg) {
2179 // It's possible for a kernarg intrinsic call to appear in a kernel with
2180 // no allocated segment, in which case we do not add the user sgpr
2181 // argument, so just return null.
2182 return DAG.getConstant(0, SDLoc(), VT);
2183 }
2184
2185 // It's undefined behavior if a function marked with the amdgpu-no-*
2186 // attributes uses the corresponding intrinsic.
2187 return DAG.getUNDEF(VT);
2188 }
2189
2190 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2191}
2192
2194 CallingConv::ID CallConv,
2195 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2196 FunctionType *FType,
2197 SIMachineFunctionInfo *Info) {
2198 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2199 const ISD::InputArg *Arg = &Ins[I];
2200
2201 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2202 "vector type argument should have been split");
2203
2204 // First check if it's a PS input addr.
2205 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2206 PSInputNum <= 15) {
2207 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2208
2209 // Inconveniently only the first part of the split is marked as isSplit,
2210 // so skip to the end. We only want to increment PSInputNum once for the
2211 // entire split argument.
2212 if (Arg->Flags.isSplit()) {
2213 while (!Arg->Flags.isSplitEnd()) {
2214 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2215 "unexpected vector split in ps argument type");
2216 if (!SkipArg)
2217 Splits.push_back(*Arg);
2218 Arg = &Ins[++I];
2219 }
2220 }
2221
2222 if (SkipArg) {
2223 // We can safely skip PS inputs.
2224 Skipped.set(Arg->getOrigArgIndex());
2225 ++PSInputNum;
2226 continue;
2227 }
2228
2229 Info->markPSInputAllocated(PSInputNum);
2230 if (Arg->Used)
2231 Info->markPSInputEnabled(PSInputNum);
2232
2233 ++PSInputNum;
2234 }
2235
2236 Splits.push_back(*Arg);
2237 }
2238}
2239
2240// Allocate special inputs passed in VGPRs.
2242 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2243 SIMachineFunctionInfo &Info) const {
2244 const LLT S32 = LLT::scalar(32);
2245 MachineRegisterInfo &MRI = MF.getRegInfo();
2246
2247 if (Info.hasWorkItemIDX()) {
2248 Register Reg = AMDGPU::VGPR0;
2249 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2250
2251 CCInfo.AllocateReg(Reg);
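// With packed TID all three workitem IDs share VGPR0: X in bits [9:0],
// Y in bits [19:10] and Z in bits [29:20].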
2252 unsigned Mask =
2253 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2254 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2255 }
2256
2257 if (Info.hasWorkItemIDY()) {
2258 assert(Info.hasWorkItemIDX());
2259 if (Subtarget->hasPackedTID()) {
2260 Info.setWorkItemIDY(
2261 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2262 } else {
2263 unsigned Reg = AMDGPU::VGPR1;
2264 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2265
2266 CCInfo.AllocateReg(Reg);
2267 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2268 }
2269 }
2270
2271 if (Info.hasWorkItemIDZ()) {
2272 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2273 if (Subtarget->hasPackedTID()) {
2274 Info.setWorkItemIDZ(
2275 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2276 } else {
2277 unsigned Reg = AMDGPU::VGPR2;
2278 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2279
2280 CCInfo.AllocateReg(Reg);
2281 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2282 }
2283 }
2284}
2285
2286 // Try to allocate a VGPR at the end of the argument list, or if no argument
2287 // VGPRs are left, allocate a stack slot instead.
2288 // If \p Mask is given, it indicates the bitfield position in the register.
2289 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2290static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2291 ArgDescriptor Arg = ArgDescriptor()) {
2292 if (Arg.isSet())
2293 return ArgDescriptor::createArg(Arg, Mask);
2294
2295 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2296 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2297 if (RegIdx == ArgVGPRs.size()) {
2298 // Spill to stack required.
2299 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2300
2301 return ArgDescriptor::createStack(Offset, Mask);
2302 }
2303
2304 unsigned Reg = ArgVGPRs[RegIdx];
2305 Reg = CCInfo.AllocateReg(Reg);
2306 assert(Reg != AMDGPU::NoRegister);
2307
2308 MachineFunction &MF = CCInfo.getMachineFunction();
2309 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2310 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2311 return ArgDescriptor::createRegister(Reg, Mask);
2312}
2313
2314 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2315 const TargetRegisterClass *RC,
2316 unsigned NumArgRegs) {
2317 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2318 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2319 if (RegIdx == ArgSGPRs.size())
2320 report_fatal_error("ran out of SGPRs for arguments");
2321
2322 unsigned Reg = ArgSGPRs[RegIdx];
2323 Reg = CCInfo.AllocateReg(Reg);
2324 assert(Reg != AMDGPU::NoRegister);
2325
2326 MachineFunction &MF = CCInfo.getMachineFunction();
2327 MF.addLiveIn(Reg, RC);
2328 return ArgDescriptor::createRegister(Reg);
2329}
2330
2331// If this has a fixed position, we still should allocate the register in the
2332// CCInfo state. Technically we could get away with this for values passed
2333// outside of the normal argument range.
2334 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2335 const TargetRegisterClass *RC,
2336 MCRegister Reg) {
2337 Reg = CCInfo.AllocateReg(Reg);
2338 assert(Reg != AMDGPU::NoRegister);
2339 MachineFunction &MF = CCInfo.getMachineFunction();
2340 MF.addLiveIn(Reg, RC);
2341}
2342
2343static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2344 if (Arg) {
2345 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2346 Arg.getRegister());
2347 } else
2348 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2349}
2350
2351static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2352 if (Arg) {
2353 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2354 Arg.getRegister());
2355 } else
2356 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2357}
2358
2359/// Allocate implicit function VGPR arguments at the end of allocated user
2360/// arguments.
2362 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2363 SIMachineFunctionInfo &Info) const {
2364 const unsigned Mask = 0x3ff;
2365 ArgDescriptor Arg;
2366
2367 if (Info.hasWorkItemIDX()) {
2368 Arg = allocateVGPR32Input(CCInfo, Mask);
2369 Info.setWorkItemIDX(Arg);
2370 }
2371
2372 if (Info.hasWorkItemIDY()) {
2373 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2374 Info.setWorkItemIDY(Arg);
2375 }
2376
2377 if (Info.hasWorkItemIDZ())
2378 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2379}
2380
2381/// Allocate implicit function VGPR arguments in fixed registers.
2383 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2384 SIMachineFunctionInfo &Info) const {
2385 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2386 if (!Reg)
2387 report_fatal_error("failed to allocate VGPR for implicit arguments");
2388
2389 const unsigned Mask = 0x3ff;
2390 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2391 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2392 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2393}
2394
2396 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2397 SIMachineFunctionInfo &Info) const {
2398 auto &ArgInfo = Info.getArgInfo();
2399 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2400
2401 // TODO: Unify handling with private memory pointers.
2402 if (UserSGPRInfo.hasDispatchPtr())
2403 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2404
2405 if (UserSGPRInfo.hasQueuePtr())
2406 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2407
2408 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2409 // constant offset from the kernarg segment.
2410 if (Info.hasImplicitArgPtr())
2411 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2412
2413 if (UserSGPRInfo.hasDispatchID())
2414 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2415
2416 // flat_scratch_init is not applicable for non-kernel functions.
2417
2418 if (Info.hasWorkGroupIDX())
2419 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2420
2421 if (Info.hasWorkGroupIDY())
2422 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2423
2424 if (Info.hasWorkGroupIDZ())
2425 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2426
2427 if (Info.hasLDSKernelId())
2428 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2429}
2430
2431// Allocate special inputs passed in user SGPRs.
2433 MachineFunction &MF,
2434 const SIRegisterInfo &TRI,
2435 SIMachineFunctionInfo &Info) const {
2436 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2437 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2438 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2439 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2440 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2441 }
2442
2443 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2444 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2445 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2446 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2447 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2448 }
2449
2450 if (UserSGPRInfo.hasDispatchPtr()) {
2451 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2452 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2453 CCInfo.AllocateReg(DispatchPtrReg);
2454 }
2455
2456 if (UserSGPRInfo.hasQueuePtr()) {
2457 Register QueuePtrReg = Info.addQueuePtr(TRI);
2458 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2459 CCInfo.AllocateReg(QueuePtrReg);
2460 }
2461
2462 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2464 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2465 CCInfo.AllocateReg(InputPtrReg);
2466
2467 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2468 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2469 }
2470
2471 if (UserSGPRInfo.hasDispatchID()) {
2472 Register DispatchIDReg = Info.addDispatchID(TRI);
2473 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2474 CCInfo.AllocateReg(DispatchIDReg);
2475 }
2476
2477 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2478 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2479 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2480 CCInfo.AllocateReg(FlatScratchInitReg);
2481 }
2482
2483 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2484 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2485 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2486 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2487 }
2488
2489 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2490 // these from the dispatch pointer.
2491}
2492
2493 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2494 // sequential, starting from the first argument.
2496 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2498 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2499 Function &F = MF.getFunction();
2500 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2501 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2502 bool InPreloadSequence = true;
2503 unsigned InIdx = 0;
2504 bool AlignedForImplictArgs = false;
2505 unsigned ImplicitArgOffset = 0;
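// Hidden arguments start at the implicit-arg pointer alignment;
// ImplicitArgOffset records the padding inserted after the last explicit
// argument.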
2506 for (auto &Arg : F.args()) {
2507 if (!InPreloadSequence || !Arg.hasInRegAttr())
2508 break;
2509
2510 unsigned ArgIdx = Arg.getArgNo();
2511 // Don't preload non-original args or parts not in the current preload
2512 // sequence.
2513 if (InIdx < Ins.size() &&
2514 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2515 break;
2516
2517 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2518 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2519 InIdx++) {
2520 assert(ArgLocs[ArgIdx].isMemLoc());
2521 auto &ArgLoc = ArgLocs[InIdx];
2522 const Align KernelArgBaseAlign = Align(16);
2523 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2524 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2525 unsigned NumAllocSGPRs =
2526 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2527
2528 // Fix alignment for hidden arguments.
2529 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2530 if (!AlignedForImplictArgs) {
2531 ImplicitArgOffset =
2532 alignTo(LastExplicitArgOffset,
2533 Subtarget->getAlignmentForImplicitArgPtr()) -
2534 LastExplicitArgOffset;
2535 AlignedForImplictArgs = true;
2536 }
2537 ArgOffset += ImplicitArgOffset;
2538 }
2539
2540 // Arg is preloaded into the previous SGPR.
2541 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2542 assert(InIdx >= 1 && "No previous SGPR");
2543 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2544 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2545 continue;
2546 }
2547
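// Any gap between the end of the previous argument and this one still
// consumes whole user SGPRs, so count it against the preload budget.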
2548 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2549 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2550 // Check for free user SGPRs for preloading.
2551 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2552 InPreloadSequence = false;
2553 break;
2554 }
2555
2556 // Preload this argument.
2557 const TargetRegisterClass *RC =
2558 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2559 SmallVectorImpl<MCRegister> *PreloadRegs =
2560 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2561
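// If the argument was split across several SGPRs, add each one as a
// separate 32-bit live-in rather than the covering tuple.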
2562 if (PreloadRegs->size() > 1)
2563 RC = &AMDGPU::SGPR_32RegClass;
2564 for (auto &Reg : *PreloadRegs) {
2565 assert(Reg);
2566 MF.addLiveIn(Reg, RC);
2567 CCInfo.AllocateReg(Reg);
2568 }
2569
2570 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2571 }
2572 }
2573}
2574
2576 const SIRegisterInfo &TRI,
2577 SIMachineFunctionInfo &Info) const {
2578 // Always allocate this last since it is a synthetic preload.
2579 if (Info.hasLDSKernelId()) {
2580 Register Reg = Info.addLDSKernelId();
2581 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2582 CCInfo.AllocateReg(Reg);
2583 }
2584}
2585
2586// Allocate special input registers that are initialized per-wave.
2589 CallingConv::ID CallConv,
2590 bool IsShader) const {
2591 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2592 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2593 // Note: user SGPRs are handled by the front-end for graphics shaders
2594 // Pad up the used user SGPRs with dead inputs.
2595
2596 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2597 // before enabling architected SGPRs for workgroup IDs.
2598 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2599
2600 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2601 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2602 // rely on it to reach 16 since if we end up having no stack usage, it will
2603 // not really be added.
2604 unsigned NumRequiredSystemSGPRs =
2605 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2606 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2607 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2608 Register Reg = Info.addReservedUserSGPR();
2609 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2610 CCInfo.AllocateReg(Reg);
2611 }
2612 }
2613
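// With architected SGPRs the workgroup IDs are available in TTMP registers,
// so no system SGPRs need to be added for them here.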
2614 if (!HasArchitectedSGPRs) {
2615 if (Info.hasWorkGroupIDX()) {
2616 Register Reg = Info.addWorkGroupIDX();
2617 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2618 CCInfo.AllocateReg(Reg);
2619 }
2620
2621 if (Info.hasWorkGroupIDY()) {
2622 Register Reg = Info.addWorkGroupIDY();
2623 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2624 CCInfo.AllocateReg(Reg);
2625 }
2626
2627 if (Info.hasWorkGroupIDZ()) {
2628 Register Reg = Info.addWorkGroupIDZ();
2629 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2630 CCInfo.AllocateReg(Reg);
2631 }
2632 }
2633
2634 if (Info.hasWorkGroupInfo()) {
2635 Register Reg = Info.addWorkGroupInfo();
2636 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2637 CCInfo.AllocateReg(Reg);
2638 }
2639
2640 if (Info.hasPrivateSegmentWaveByteOffset()) {
2641 // Scratch wave offset passed in system SGPR.
2642 unsigned PrivateSegmentWaveByteOffsetReg;
2643
2644 if (IsShader) {
2645 PrivateSegmentWaveByteOffsetReg =
2646 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2647
2648 // This is true if the scratch wave byte offset doesn't have a fixed
2649 // location.
2650 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2651 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2652 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2653 }
2654 } else
2655 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2656
2657 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2658 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2659 }
2660
2661 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2662 Info.getNumPreloadedSGPRs() >= 16);
2663}
2664
2666 MachineFunction &MF,
2667 const SIRegisterInfo &TRI,
2668 SIMachineFunctionInfo &Info) {
2669 // Now that we've figured out where the scratch register inputs are, see if
2670 // we should reserve the arguments and use them directly.
2671 MachineFrameInfo &MFI = MF.getFrameInfo();
2672 bool HasStackObjects = MFI.hasStackObjects();
2673 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2674
2675 // Record that we know we have non-spill stack objects so we don't need to
2676 // check all stack objects later.
2677 if (HasStackObjects)
2678 Info.setHasNonSpillStackObjects(true);
2679
2680 // Everything live out of a block is spilled with fast regalloc, so it's
2681 // almost certain that spilling will be required.
2682 if (TM.getOptLevel() == CodeGenOptLevel::None)
2683 HasStackObjects = true;
2684
2685 // For now assume stack access is needed in any callee functions, so we need
2686 // the scratch registers to pass in.
2687 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2688
2689 if (!ST.enableFlatScratch()) {
2690 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2691 // If we have stack objects, we unquestionably need the private buffer
2692 // resource. For the Code Object V2 ABI, this will be the first 4 user
2693 // SGPR inputs. We can reserve those and use them directly.
2694
2695 Register PrivateSegmentBufferReg =
2697 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2698 } else {
2699 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2700 // We tentatively reserve the last registers (skipping those which may
2701 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2702 // we'll replace these with the ones immediately after those which were
2703 // really allocated. In the prologue copies will be inserted from the
2704 // argument to these reserved registers.
2705
2706 // Without HSA, relocations are used for the scratch pointer and the
2707 // buffer resource setup is always inserted in the prologue. Scratch wave
2708 // offset is still in an input SGPR.
2709 Info.setScratchRSrcReg(ReservedBufferReg);
2710 }
2711 }
2712
2714
2715 // For entry functions we have to set up the stack pointer if we use it,
2716 // whereas non-entry functions get this "for free". This means there is no
2717 // intrinsic advantage to using S32 over S34 in cases where we do not have
2718 // calls but do need a frame pointer (i.e. if we are requested to have one
2719 // because frame pointer elimination is disabled). To keep things simple we
2720 // only ever use S32 as the call ABI stack pointer, and so using it does not
2721 // imply we need a separate frame pointer.
2722 //
2723 // Try to use s32 as the SP, but move it if it would interfere with input
2724 // arguments. This won't work with calls though.
2725 //
2726 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2727 // registers.
2728 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2729 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2730 } else {
2732
2733 if (MFI.hasCalls())
2734 report_fatal_error("call in graphics shader with too many input SGPRs");
2735
2736 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2737 if (!MRI.isLiveIn(Reg)) {
2738 Info.setStackPtrOffsetReg(Reg);
2739 break;
2740 }
2741 }
2742
2743 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2744 report_fatal_error("failed to find register for SP");
2745 }
2746
2747 // hasFP should be accurate for entry functions even before the frame is
2748 // finalized, because it does not rely on the known stack size, only
2749 // properties like whether variable sized objects are present.
2750 if (ST.getFrameLowering()->hasFP(MF)) {
2751 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2752 }
2753}
2754
2757 return !Info->isEntryFunction();
2758}
2759
2761
2763 MachineBasicBlock *Entry,
2764 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2766
2767 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2768 if (!IStart)
2769 return;
2770
2771 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2772 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2773 MachineBasicBlock::iterator MBBI = Entry->begin();
2774 for (const MCPhysReg *I = IStart; *I; ++I) {
2775 const TargetRegisterClass *RC = nullptr;
2776 if (AMDGPU::SReg_64RegClass.contains(*I))
2777 RC = &AMDGPU::SGPR_64RegClass;
2778 else if (AMDGPU::SReg_32RegClass.contains(*I))
2779 RC = &AMDGPU::SGPR_32RegClass;
2780 else
2781 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2782
2783 Register NewVR = MRI->createVirtualRegister(RC);
2784 // Create copy from CSR to a virtual register.
2785 Entry->addLiveIn(*I);
2786 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2787 .addReg(*I);
2788
2789 // Insert the copy-back instructions right before the terminator.
2790 for (auto *Exit : Exits)
2791 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2792 TII->get(TargetOpcode::COPY), *I)
2793 .addReg(NewVR);
2794 }
2795}
2796
2798 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2799 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2800 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2802
2804 const Function &Fn = MF.getFunction();
2807
2808 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2809 DiagnosticInfoUnsupported NoGraphicsHSA(
2810 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2811 DAG.getContext()->diagnose(NoGraphicsHSA);
2812 return DAG.getEntryNode();
2813 }
2814
2817 BitVector Skipped(Ins.size());
2818 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2819 *DAG.getContext());
2820
2821 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2822 bool IsKernel = AMDGPU::isKernel(CallConv);
2823 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2824
2825 if (IsGraphics) {
2826 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2827 assert(!UserSGPRInfo.hasDispatchPtr() &&
2828 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2829 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2830 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2831 (void)UserSGPRInfo;
2832 if (!Subtarget->enableFlatScratch())
2833 assert(!UserSGPRInfo.hasFlatScratchInit());
2834 if ((CallConv != CallingConv::AMDGPU_CS &&
2835 CallConv != CallingConv::AMDGPU_Gfx) ||
2836 !Subtarget->hasArchitectedSGPRs())
2837 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2838 !Info->hasWorkGroupIDZ());
2839 }
2840
2841 if (CallConv == CallingConv::AMDGPU_PS) {
2842 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2843
2844 // At least one interpolation mode must be enabled or else the GPU will
2845 // hang.
2846 //
2847 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2848 // set PSInputAddr, the user wants to enable some bits after the compilation
2849 // based on run-time states. Since we can't know what the final PSInputEna
2850 // will look like, we shouldn't do anything here and the user should take
2851 // responsibility for the correct programming.
2852 //
2853 // Otherwise, the following restrictions apply:
2854 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2855 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2856 // enabled too.
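// If those restrictions would otherwise be violated, enable input 0 (and
// reserve VGPR0/VGPR1 for it) so that at least one interpolation mode is
// enabled.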
2857 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2858 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2859 CCInfo.AllocateReg(AMDGPU::VGPR0);
2860 CCInfo.AllocateReg(AMDGPU::VGPR1);
2861 Info->markPSInputAllocated(0);
2862 Info->markPSInputEnabled(0);
2863 }
2864 if (Subtarget->isAmdPalOS()) {
2865 // For isAmdPalOS, the user does not enable some bits after compilation
2866 // based on run-time states; the register values being generated here are
2867 // the final ones set in hardware. Therefore we need to apply the
2868 // workaround to PSInputAddr and PSInputEnable together. (The case where
2869 // a bit is set in PSInputAddr but not PSInputEnable is where the
2870 // frontend set up an input arg for a particular interpolation mode, but
2871 // nothing uses that input arg. Really we should have an earlier pass
2872 // that removes such an arg.)
2873 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2874 if ((PsInputBits & 0x7F) == 0 ||
2875 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2876 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2877 }
2878 } else if (IsKernel) {
2879 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2880 } else {
2881 Splits.append(Ins.begin(), Ins.end());
2882 }
2883
2884 if (IsKernel)
2885 analyzeFormalArgumentsCompute(CCInfo, Ins);
2886
2887 if (IsEntryFunc) {
2888 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2889 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2890 if (IsKernel && Subtarget->hasKernargPreload())
2891 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2892
2893 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2894 } else if (!IsGraphics) {
2895 // For the fixed ABI, pass workitem IDs in the last argument register.
2896 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2897
2898 // FIXME: Sink this into allocateSpecialInputSGPRs
2899 if (!Subtarget->enableFlatScratch())
2900 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2901
2902 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2903 }
2904
2905 if (!IsKernel) {
2906 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2907 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2908 }
2909
2911
2912 // FIXME: This is the minimum kernel argument alignment. We should improve
2913 // this to the maximum alignment of the arguments.
2914 //
2915 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2916 // kern arg offset.
2917 const Align KernelArgBaseAlign = Align(16);
2918
2919 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2920 const ISD::InputArg &Arg = Ins[i];
2921 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2922 InVals.push_back(DAG.getUNDEF(Arg.VT));
2923 continue;
2924 }
2925
2926 CCValAssign &VA = ArgLocs[ArgIdx++];
2927 MVT VT = VA.getLocVT();
2928
2929 if (IsEntryFunc && VA.isMemLoc()) {
2930 VT = Ins[i].VT;
2931 EVT MemVT = VA.getLocVT();
2932
2933 const uint64_t Offset = VA.getLocMemOffset();
2934 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2935
2936 if (Arg.Flags.isByRef()) {
2937 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2938
2939 const GCNTargetMachine &TM =
2940 static_cast<const GCNTargetMachine &>(getTargetMachine());
2941 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2942 Arg.Flags.getPointerAddrSpace())) {
2945 }
2946
2947 InVals.push_back(Ptr);
2948 continue;
2949 }
2950
2951 SDValue NewArg;
2952 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2953 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2954 // In this case the argument is packed into the previous preload SGPR.
2955 int64_t AlignDownOffset = alignDown(Offset, 4);
2956 int64_t OffsetDiff = Offset - AlignDownOffset;
2957 EVT IntVT = MemVT.changeTypeToInteger();
2958
2962 Register Reg =
2963 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2964
2965 assert(Reg);
2966 Register VReg = MRI.getLiveInVirtReg(Reg);
2967 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2968
2969 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2970 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2971
2972 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2973 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2974 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2975 Ins[i].Flags.isSExt(), &Ins[i]);
2976
2977 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2978 } else {
2982 const SmallVectorImpl<MCRegister> &PreloadRegs =
2983 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2984
2985 SDValue Copy;
2986 if (PreloadRegs.size() == 1) {
2987 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2988 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2989 NewArg = DAG.getCopyFromReg(
2990 Chain, DL, VReg,
2992 TRI->getRegSizeInBits(*RC)));
2993
2994 } else {
2995 // If the kernarg alignment does not match the alignment of the SGPR
2996 // tuple RC that can accommodate this argument, it will be built up
2997 // via copies from the individual SGPRs that the argument was
2998 // preloaded to.
3000 for (auto Reg : PreloadRegs) {
3001 Register VReg = MRI.getLiveInVirtReg(Reg);
3002 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3003 Elts.push_back(Copy);
3004 }
3005 NewArg =
3006 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3007 PreloadRegs.size()),
3008 DL, Elts);
3009 }
3010
3011 // If the argument was preloaded to multiple consecutive 32-bit
3012 // registers because of misalignment between addressable SGPR tuples
3013 // and the argument size, we can still assume, because of kernarg
3014 // segment alignment restrictions, that NewArg's size is the same as
3015 // MemVT and just do a bitcast. If MemVT is less than 32 bits, we add a
3016 // truncate since we cannot preload to less than a single SGPR and the
3017 // MemVT may be smaller.
3018 EVT MemVTInt =
3020 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3021 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3022
3023 NewArg = DAG.getBitcast(MemVT, NewArg);
3024 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3025 Ins[i].Flags.isSExt(), &Ins[i]);
3026 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3027 }
3028 } else {
3029 // Hidden arguments that are in the kernel signature must be preloaded
3030 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3031 // the argument list and is not preloaded.
3032 if (Arg.isOrigArg()) {
3033 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3034 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3035 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3036 *OrigArg->getParent(),
3037 "hidden argument in kernel signature was not preloaded",
3038 DL.getDebugLoc());
3039 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3040 }
3041 }
3042
3043 NewArg =
3044 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3045 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3046 }
3047 Chains.push_back(NewArg.getValue(1));
3048
3049 auto *ParamTy =
3050 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3052 ParamTy &&
3053 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3054 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3055 // On SI local pointers are just offsets into LDS, so they are always
3056 // less than 16-bits. On CI and newer they could potentially be
3057 // real pointers, so we can't guarantee their size.
3058 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3059 DAG.getValueType(MVT::i16));
3060 }
3061
3062 InVals.push_back(NewArg);
3063 continue;
3064 }
3065 if (!IsEntryFunc && VA.isMemLoc()) {
3066 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3067 InVals.push_back(Val);
3068 if (!Arg.Flags.isByVal())
3069 Chains.push_back(Val.getValue(1));
3070 continue;
3071 }
3072
3073 assert(VA.isRegLoc() && "Parameter must be in a register!");
3074
3075 Register Reg = VA.getLocReg();
3076 const TargetRegisterClass *RC = nullptr;
3077 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3078 RC = &AMDGPU::VGPR_32RegClass;
3079 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3080 RC = &AMDGPU::SGPR_32RegClass;
3081 else
3082 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3083 EVT ValVT = VA.getValVT();
3084
3085 Reg = MF.addLiveIn(Reg, RC);
3086 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3087
3088 if (Arg.Flags.isSRet()) {
3089 // The return object should be reasonably addressable.
3090
3091 // FIXME: This helps when the return is a real sret. If it is an
3092 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3093 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3094 unsigned NumBits =
3096 Val = DAG.getNode(
3097 ISD::AssertZext, DL, VT, Val,
3098 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3099 }
3100
3101 // If this is an 8 or 16-bit value, it is really passed promoted
3102 // to 32 bits. Insert an assert[sz]ext to capture this, then
3103 // truncate to the right size.
3104 switch (VA.getLocInfo()) {
3105 case CCValAssign::Full:
3106 break;
3107 case CCValAssign::BCvt:
3108 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3109 break;
3110 case CCValAssign::SExt:
3111 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3112 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3113 break;
3114 case CCValAssign::ZExt:
3115 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3116 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3117 break;
3118 case CCValAssign::AExt:
3119 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3120 break;
3121 default:
3122 llvm_unreachable("Unknown loc info!");
3123 }
3124
3125 InVals.push_back(Val);
3126 }
3127
3128 // Start adding system SGPRs.
3129 if (IsEntryFunc)
3130 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3131
3132 // DAG.getPass() returns nullptr when using new pass manager.
3133 // TODO: Use DAG.getMFAM() to access analysis result.
3134 if (DAG.getPass()) {
3135 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3136 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3137 }
3138
3139 unsigned StackArgSize = CCInfo.getStackSize();
3140 Info->setBytesInStackArgArea(StackArgSize);
3141
3142 return Chains.empty() ? Chain
3143 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3144}
3145
3146// TODO: If return values can't fit in registers, we should return as many as
3147// possible in registers before passing on stack.
3149 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3150 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3151 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3152 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3153 // for shaders. Vector types should be explicitly handled by CC.
3154 if (AMDGPU::isEntryFunctionCC(CallConv))
3155 return true;
3156
3158 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3159 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3160 return false;
3161
3162 // We must use the stack if return would require unavailable registers.
3163 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3164 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3165 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3166 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3167 return false;
3168
3169 return true;
3170}
3171
3172SDValue
3174 bool isVarArg,
3176 const SmallVectorImpl<SDValue> &OutVals,
3177 const SDLoc &DL, SelectionDAG &DAG) const {
3180
3181 if (AMDGPU::isKernel(CallConv)) {
3182 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3183 OutVals, DL, DAG);
3184 }
3185
3186 bool IsShader = AMDGPU::isShader(CallConv);
3187
3188 Info->setIfReturnsVoid(Outs.empty());
3189 bool IsWaveEnd = Info->returnsVoid() && IsShader;
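// Shaders that return no values end the wave (ENDPGM) instead of returning
// to a caller.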
3190
3191 // CCValAssign - represent the assignment of the return value to a location.
3194
3195 // CCState - Info about the registers and stack slots.
3196 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3197 *DAG.getContext());
3198
3199 // Analyze outgoing return values.
3200 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3201
3202 SDValue Glue;
3204 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3205
3206 // Copy the result values into the output registers.
3207 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3208 ++I, ++RealRVLocIdx) {
3209 CCValAssign &VA = RVLocs[I];
3210 assert(VA.isRegLoc() && "Can only return in registers!");
3211 // TODO: Partially return in registers if return values don't fit.
3212 SDValue Arg = OutVals[RealRVLocIdx];
3213
3214 // Copied from other backends.
3215 switch (VA.getLocInfo()) {
3216 case CCValAssign::Full:
3217 break;
3218 case CCValAssign::BCvt:
3219 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3220 break;
3221 case CCValAssign::SExt:
3222 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3223 break;
3224 case CCValAssign::ZExt:
3225 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3226 break;
3227 case CCValAssign::AExt:
3228 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3229 break;
3230 default:
3231 llvm_unreachable("Unknown loc info!");
3232 }
3233
3234 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3235 Glue = Chain.getValue(1);
3236 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3237 }
3238
3239 // FIXME: Does sret work properly?
3240 if (!Info->isEntryFunction()) {
3241 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3242 const MCPhysReg *I =
3243 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3244 if (I) {
3245 for (; *I; ++I) {
3246 if (AMDGPU::SReg_64RegClass.contains(*I))
3247 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3248 else if (AMDGPU::SReg_32RegClass.contains(*I))
3249 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3250 else
3251 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3252 }
3253 }
3254 }
3255
3256 // Update chain and glue.
3257 RetOps[0] = Chain;
3258 if (Glue.getNode())
3259 RetOps.push_back(Glue);
3260
3261 unsigned Opc = AMDGPUISD::ENDPGM;
3262 if (!IsWaveEnd)
3264 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3265}
3266
3268 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3269 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3270 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3271 SDValue ThisVal) const {
3272 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3273
3274 // Assign locations to each value returned by this call.
3276 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3277 *DAG.getContext());
3278 CCInfo.AnalyzeCallResult(Ins, RetCC);
3279
3280 // Copy all of the result registers out of their specified physreg.
3281 for (CCValAssign VA : RVLocs) {
3282 SDValue Val;
3283
3284 if (VA.isRegLoc()) {
3285 Val =
3286 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3287 Chain = Val.getValue(1);
3288 InGlue = Val.getValue(2);
3289 } else if (VA.isMemLoc()) {
3290 report_fatal_error("TODO: return values in memory");
3291 } else
3292 llvm_unreachable("unknown argument location type");
3293
3294 switch (VA.getLocInfo()) {
3295 case CCValAssign::Full:
3296 break;
3297 case CCValAssign::BCvt:
3298 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3299 break;
3300 case CCValAssign::ZExt:
3301 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3302 DAG.getValueType(VA.getValVT()));
3303 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3304 break;
3305 case CCValAssign::SExt:
3306 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3307 DAG.getValueType(VA.getValVT()));
3308 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3309 break;
3310 case CCValAssign::AExt:
3311 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3312 break;
3313 default:
3314 llvm_unreachable("Unknown loc info!");
3315 }
3316
3317 InVals.push_back(Val);
3318 }
3319
3320 return Chain;
3321}
3322
3323 // Add code to pass special inputs required depending on used features, separate
3324// from the explicit user arguments present in the IR.
3326 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3327 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3328 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3329 // If we don't have a call site, this was a call inserted by
3330 // legalization. These can never use special inputs.
3331 if (!CLI.CB)
3332 return;
3333
3334 SelectionDAG &DAG = CLI.DAG;
3335 const SDLoc &DL = CLI.DL;
3336 const Function &F = DAG.getMachineFunction().getFunction();
3337
3338 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3339 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3340
3341 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3343 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3344 // DAG.getPass() returns nullptr when using new pass manager.
3345 // TODO: Use DAG.getMFAM() to access analysis result.
3346 if (DAG.getPass()) {
3347 auto &ArgUsageInfo =
3349 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3350 }
3351 }
3352
3353 // TODO: Unify with private memory register handling. This is complicated by
3354 // the fact that at least in kernels, the input argument is not necessarily
3355 // in the same location as the input.
3356 // clang-format off
3357 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3359 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3360 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3361 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3362 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3363 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3364 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3365 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3366 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3367 };
3368 // clang-format on
3369
3370 for (auto [InputID, Attr] : ImplicitAttrs) {
3371 // If the callee does not use the attribute value, skip copying the value.
3372 if (CLI.CB->hasFnAttr(Attr))
3373 continue;
3374
3375 const auto [OutgoingArg, ArgRC, ArgTy] =
3376 CalleeArgInfo->getPreloadedValue(InputID);
3377 if (!OutgoingArg)
3378 continue;
3379
3380 const auto [IncomingArg, IncomingArgRC, Ty] =
3381 CallerArgInfo.getPreloadedValue(InputID);
3382 assert(IncomingArgRC == ArgRC);
3383
3384 // All special arguments are ints for now.
3385 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3386 SDValue InputReg;
3387
3388 if (IncomingArg) {
3389 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3390 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3391 // The implicit arg ptr is special because it doesn't have a corresponding
3392 // input for kernels, and is computed from the kernarg segment pointer.
3393 InputReg = getImplicitArgPtr(DAG, DL);
3394 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3395 std::optional<uint32_t> Id =
3397 if (Id.has_value()) {
3398 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3399 } else {
3400 InputReg = DAG.getUNDEF(ArgVT);
3401 }
3402 } else {
3403 // We may have proven the input wasn't needed, although the ABI still
3404 // requires it. We just need to allocate the register appropriately.
3405 InputReg = DAG.getUNDEF(ArgVT);
3406 }
3407
3408 if (OutgoingArg->isRegister()) {
3409 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3410 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3411 report_fatal_error("failed to allocate implicit input argument");
3412 } else {
3413 unsigned SpecialArgOffset =
3414 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3415 SDValue ArgStore =
3416 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3417 MemOpChains.push_back(ArgStore);
3418 }
3419 }
3420
3421 // Pack workitem IDs into a single register, or pass them as-is if already
3422 // packed.
3423
3424 auto [OutgoingArg, ArgRC, Ty] =
3426 if (!OutgoingArg)
3427 std::tie(OutgoingArg, ArgRC, Ty) =
3429 if (!OutgoingArg)
3430 std::tie(OutgoingArg, ArgRC, Ty) =
3432 if (!OutgoingArg)
3433 return;
3434
3435 const ArgDescriptor *IncomingArgX = std::get<0>(
3437 const ArgDescriptor *IncomingArgY = std::get<0>(
3439 const ArgDescriptor *IncomingArgZ = std::get<0>(
3441
3442 SDValue InputReg;
3443 SDLoc SL;
3444
3445 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3446 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3447 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3448
3449 // If incoming ids are not packed we need to pack them.
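// The packed layout matches the entry-point convention: X in bits [9:0],
// Y in bits [19:10] and Z in bits [29:20] of a single VGPR.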
3450 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3451 NeedWorkItemIDX) {
3452 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3453 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3454 } else {
3455 InputReg = DAG.getConstant(0, DL, MVT::i32);
3456 }
3457 }
3458
3459 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3460 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3461 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3462 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3463 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3464 InputReg = InputReg.getNode()
3465 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3466 : Y;
3467 }
3468
3469 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3470 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3471 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3472 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3473 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3474 InputReg = InputReg.getNode()
3475 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3476 : Z;
3477 }
3478
3479 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3480 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3481 // We're in a situation where the outgoing function requires the workitem
3482 // ID, but the calling function does not have it (e.g. a graphics function
3483 // calling a C calling convention function). This is illegal, but we need
3484 // to produce something.
3485 InputReg = DAG.getUNDEF(MVT::i32);
3486 } else {
3487 // Workitem IDs are already packed; any of the present incoming arguments
3488 // will carry all required fields.
3489 ArgDescriptor IncomingArg =
3490 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3491 : IncomingArgY ? *IncomingArgY
3492 : *IncomingArgZ,
3493 ~0u);
3494 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3495 }
3496 }
3497
3498 if (OutgoingArg->isRegister()) {
3499 if (InputReg)
3500 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3501
3502 CCInfo.AllocateReg(OutgoingArg->getRegister());
3503 } else {
3504 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3505 if (InputReg) {
3506 SDValue ArgStore =
3507 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3508 MemOpChains.push_back(ArgStore);
3509 }
3510 }
3511}
3512
3513 static bool canGuaranteeTCO(CallingConv::ID CC) {
3514 return CC == CallingConv::Fast;
3515}
3516
3517/// Return true if we might ever do TCO for calls with this calling convention.
3518 static bool mayTailCallThisCC(CallingConv::ID CC) {
3519 switch (CC) {
3520 case CallingConv::C:
3521 case CallingConv::AMDGPU_Gfx:
3522 return true;
3523 default:
3524 return canGuaranteeTCO(CC);
3525 }
3526}
3527
3529 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3531 const SmallVectorImpl<SDValue> &OutVals,
3532 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3533 if (AMDGPU::isChainCC(CalleeCC))
3534 return true;
3535
3536 if (!mayTailCallThisCC(CalleeCC))
3537 return false;
3538
3539 // For a divergent call target, we need to do a waterfall loop over the
3540 // possible callees which precludes us from using a simple jump.
3541 if (Callee->isDivergent())
3542 return false;
3543
3545 const Function &CallerF = MF.getFunction();
3546 CallingConv::ID CallerCC = CallerF.getCallingConv();
3548 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3549
3550 // Kernels aren't callable, and don't have a live-in return address, so it
3551 // doesn't make sense to do a tail call with entry functions.
3552 if (!CallerPreserved)
3553 return false;
3554
3555 bool CCMatch = CallerCC == CalleeCC;
3556
3558 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3559 return true;
3560 return false;
3561 }
3562
3563 // TODO: Can we handle var args?
3564 if (IsVarArg)
3565 return false;
3566
3567 for (const Argument &Arg : CallerF.args()) {
3568 if (Arg.hasByValAttr())
3569 return false;
3570 }
3571
3572 LLVMContext &Ctx = *DAG.getContext();
3573
3574 // Check that the call results are passed in the same way.
3575 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3576 CCAssignFnForCall(CalleeCC, IsVarArg),
3577 CCAssignFnForCall(CallerCC, IsVarArg)))
3578 return false;
3579
3580 // The callee has to preserve all registers the caller needs to preserve.
3581 if (!CCMatch) {
3582 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3583 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3584 return false;
3585 }
3586
3587 // Nothing more to check if the callee is taking no arguments.
3588 if (Outs.empty())
3589 return true;
3590
3592 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3593
3594 // FIXME: We are not allocating special input registers, so we will be
3595 // deciding based on incorrect register assignments.
3596 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3597
3598 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3599 // If the stack arguments for this call do not fit into our own save area,
3600 // then the call cannot be made a tail call.
3601 // TODO: Is this really necessary?
3602 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3603 return false;
3604
3605 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3606 // FIXME: What about inreg arguments that end up passed in memory?
3607 if (!CCVA.isRegLoc())
3608 continue;
3609
3610 // If we are passing an argument in an SGPR, and the value is divergent,
3611 // this call requires a waterfall loop.
3612 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3613 LLVM_DEBUG(
3614 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3615 << printReg(CCVA.getLocReg(), TRI) << '\n');
3616 return false;
3617 }
3618 }
3619
3620 const MachineRegisterInfo &MRI = MF.getRegInfo();
3621 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3622}
3623
3625 if (!CI->isTailCall())
3626 return false;
3627
3628 const Function *ParentFn = CI->getParent()->getParent();
3630 return false;
3631 return true;
3632}
3633
3634// The wave scratch offset register is used as the global base pointer.
3636 SmallVectorImpl<SDValue> &InVals) const {
3637 CallingConv::ID CallConv = CLI.CallConv;
3638 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3639
3640 SelectionDAG &DAG = CLI.DAG;
3641
3642 TargetLowering::ArgListEntry RequestedExec;
3643 if (IsChainCallConv) {
3644 // The last argument should be the value that we need to put in EXEC.
3645 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3646 // don't treat it like the rest of the arguments.
3647 RequestedExec = CLI.Args.back();
3648 assert(RequestedExec.Node && "No node for EXEC");
3649
3650 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3651 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3652
3653 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3654 CLI.Outs.pop_back();
3655 CLI.OutVals.pop_back();
3656
3657 if (RequestedExec.Ty->isIntegerTy(64)) {
3658 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3659 CLI.Outs.pop_back();
3660 CLI.OutVals.pop_back();
3661 }
3662
3663 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3664 "Haven't popped all the pieces of the EXEC mask");
3665 }
3666
3667 const SDLoc &DL = CLI.DL;
3668 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3669 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3670 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3671 SDValue Chain = CLI.Chain;
3672 SDValue Callee = CLI.Callee;
3673 bool &IsTailCall = CLI.IsTailCall;
3674 bool IsVarArg = CLI.IsVarArg;
3675 bool IsSibCall = false;
3676 MachineFunction &MF = DAG.getMachineFunction();
3677
3678 if (Callee.isUndef() || isNullConstant(Callee)) {
3679 if (!CLI.IsTailCall) {
3680 for (ISD::InputArg &Arg : CLI.Ins)
3681 InVals.push_back(DAG.getUNDEF(Arg.VT));
3682 }
3683
3684 return Chain;
3685 }
3686
3687 if (IsVarArg) {
3688 return lowerUnhandledCall(CLI, InVals,
3689 "unsupported call to variadic function ");
3690 }
3691
3692 if (!CLI.CB)
3693 report_fatal_error("unsupported libcall legalization");
3694
3695 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3696 return lowerUnhandledCall(CLI, InVals,
3697 "unsupported required tail call to function ");
3698 }
3699
3700 if (IsTailCall) {
3701 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3702 Outs, OutVals, Ins, DAG);
3703 if (!IsTailCall &&
3704 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3705 report_fatal_error("failed to perform tail call elimination on a call "
3706 "site marked musttail or on llvm.amdgcn.cs.chain");
3707 }
3708
3709 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3710
3711 // A sibling call is one where we're under the usual C ABI and not planning
3712 // to change that, but can still do a tail call.
3713 if (!TailCallOpt && IsTailCall)
3714 IsSibCall = true;
3715
3716 if (IsTailCall)
3717 ++NumTailCalls;
3718 }
3719
3722 SmallVector<SDValue, 8> MemOpChains;
3723
3724 // Analyze operands of the call, assigning locations to each operand.
3725 SmallVector<CCValAssign, 16> ArgLocs;
3726 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3727 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3728
3729 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3730 // With a fixed ABI, allocate fixed registers before user arguments.
3731 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3732 }
3733
3734 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3735
3736 // Get a count of how many bytes are to be pushed on the stack.
3737 unsigned NumBytes = CCInfo.getStackSize();
3738
3739 if (IsSibCall) {
3740 // Since we're not changing the ABI to make this a tail call, the memory
3741 // operands are already available in the caller's incoming argument space.
3742 NumBytes = 0;
3743 }
3744
3745 // FPDiff is the byte offset of the call's argument area from the callee's.
3746 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3747 // by this amount for a tail call. In a sibling call it must be 0 because the
3748 // caller will deallocate the entire stack and the callee still expects its
3749 // arguments to begin at SP+0. Completely unused for non-tail calls.
3750 int32_t FPDiff = 0;
3751 MachineFrameInfo &MFI = MF.getFrameInfo();
3752 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3753
3754 // Adjust the stack pointer for the new arguments...
3755 // These operations are automatically eliminated by the prolog/epilog pass
3756 if (!IsSibCall)
3757 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3758
3759 if (!IsSibCall || IsChainCallConv) {
3760 if (!Subtarget->enableFlatScratch()) {
3761 SmallVector<SDValue, 4> CopyFromChains;
3762
3763 // In the HSA case, this should be an identity copy.
3764 SDValue ScratchRSrcReg =
3765 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3766 RegsToPass.emplace_back(IsChainCallConv
3767 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3768 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3769 ScratchRSrcReg);
3770 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3771 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3772 }
3773 }
3774
3775 const unsigned NumSpecialInputs = RegsToPass.size();
3776
3777 MVT PtrVT = MVT::i32;
3778
3779 // Walk the register/memloc assignments, inserting copies/loads.
3780 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3781 CCValAssign &VA = ArgLocs[i];
3782 SDValue Arg = OutVals[i];
3783
3784 // Promote the value if needed.
3785 switch (VA.getLocInfo()) {
3786 case CCValAssign::Full:
3787 break;
3788 case CCValAssign::BCvt:
3789 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3790 break;
3791 case CCValAssign::ZExt:
3792 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3793 break;
3794 case CCValAssign::SExt:
3795 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3796 break;
3797 case CCValAssign::AExt:
3798 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3799 break;
3800 case CCValAssign::FPExt:
3801 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3802 break;
3803 default:
3804 llvm_unreachable("Unknown loc info!");
3805 }
3806
3807 if (VA.isRegLoc()) {
3808 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3809 } else {
3810 assert(VA.isMemLoc());
3811
3812 SDValue DstAddr;
3813 MachinePointerInfo DstInfo;
3814
3815 unsigned LocMemOffset = VA.getLocMemOffset();
3816 int32_t Offset = LocMemOffset;
3817
3818 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3819 MaybeAlign Alignment;
3820
3821 if (IsTailCall) {
3822 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3823 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3824 : VA.getValVT().getStoreSize();
3825
3826 // FIXME: We can have better than the minimum byval required alignment.
3827 Alignment =
3828 Flags.isByVal()
3829 ? Flags.getNonZeroByValAlign()
3830 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3831
3832 Offset = Offset + FPDiff;
3833 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3834
3835 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3836 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3837
3838 // Make sure any stack arguments overlapping with where we're storing
3839 // are loaded before this eventual operation. Otherwise they'll be
3840 // clobbered.
3841
3842 // FIXME: Why is this really necessary? This seems to just result in a
3843 // lot of code to copy the stack arguments and write them back to the same
3844 // locations, which are supposed to be immutable?
3845 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3846 } else {
3847 // Stores to the argument stack area are relative to the stack pointer.
3848 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3849 MVT::i32);
3850 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3851 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3852 Alignment =
3853 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3854 }
3855
3856 if (Outs[i].Flags.isByVal()) {
3857 SDValue SizeNode =
3858 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3859 SDValue Cpy =
3860 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3861 Outs[i].Flags.getNonZeroByValAlign(),
3862 /*isVol = */ false, /*AlwaysInline = */ true,
3863 /*CI=*/nullptr, std::nullopt, DstInfo,
3864 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3865
3866 MemOpChains.push_back(Cpy);
3867 } else {
3868 SDValue Store =
3869 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3870 MemOpChains.push_back(Store);
3871 }
3872 }
3873 }
3874
3875 if (!MemOpChains.empty())
3876 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3877
3878 SDValue ReadFirstLaneID =
3879 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3880
3881 SDValue TokenGlue;
3882 if (CLI.ConvergenceControlToken) {
3883 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3884 CLI.ConvergenceControlToken);
3885 }
3886
3887 // Build a sequence of copy-to-reg nodes chained together with token chain
3888 // and flag operands which copy the outgoing args into the appropriate regs.
3889 SDValue InGlue;
3890
3891 unsigned ArgIdx = 0;
3892 for (auto [Reg, Val] : RegsToPass) {
3893 if (ArgIdx++ >= NumSpecialInputs &&
3894 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3895 // For chain calls, the inreg arguments are required to be
3896 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3897 // they are uniform.
3898 //
3899 // For other calls, if an inreg argument is known to be uniform,
3900 // speculatively insert a readfirstlane in case it is in a VGPR.
3901 //
3902 // FIXME: If the value is divergent we would need a waterfall loop here;
3903 // until that exists, such calls continue to produce invalid code.
3904
3905 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3906 if (TokenGlue)
3907 ReadfirstlaneArgs.push_back(TokenGlue);
3908 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3909 ReadfirstlaneArgs);
3910 }
3911
3912 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3913 InGlue = Chain.getValue(1);
3914 }
3915
3916 // We don't usually want to end the call-sequence here because we would tidy
3917 // the frame up *after* the call. However, in the ABI-changing tail-call case
3918 // we've carefully laid out the parameters so that when sp is reset they'll be
3919 // in the correct location.
3920 if (IsTailCall && !IsSibCall) {
3921 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3922 InGlue = Chain.getValue(1);
3923 }
3924
3925 std::vector<SDValue> Ops({Chain});
3926
3927 // Add a redundant copy of the callee global which will not be legalized, as
3928 // we need direct access to the callee later.
3929 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3930 const GlobalValue *GV = GSD->getGlobal();
3931 Ops.push_back(Callee);
3932 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3933 } else {
3934 if (IsTailCall) {
3935 // isEligibleForTailCallOptimization considered whether the call target is
3936 // divergent, but we may still end up with a uniform value in a VGPR.
3937 // Insert a readfirstlane just in case.
3938 SDValue ReadFirstLaneID =
3939 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3940
3941 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3942 if (TokenGlue)
3943 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3944 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3945 ReadfirstlaneArgs);
3946 }
3947
3948 Ops.push_back(Callee);
3949 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3950 }
3951
3952 if (IsTailCall) {
3953 // Each tail call may have to adjust the stack by a different amount, so
3954 // this information must travel along with the operation for eventual
3955 // consumption by emitEpilogue.
3956 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3957 }
3958
3959 if (IsChainCallConv)
3960 Ops.push_back(RequestedExec.Node);
3961
3962 // Add argument registers to the end of the list so that they are known live
3963 // into the call.
3964 for (auto &[Reg, Val] : RegsToPass)
3965 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3966
3967 // Add a register mask operand representing the call-preserved registers.
3968 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3969 assert(Mask && "Missing call preserved mask for calling convention");
3970 Ops.push_back(DAG.getRegisterMask(Mask));
3971
3972 if (SDValue Token = CLI.ConvergenceControlToken) {
3973 SmallVector<SDValue, 4> GlueOps;
3974 GlueOps.push_back(Token);
3975 if (InGlue)
3976 GlueOps.push_back(InGlue);
3977
3978 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3979 MVT::Glue, GlueOps),
3980 0);
3981 }
3982
3983 if (InGlue)
3984 Ops.push_back(InGlue);
3985
3986 // If we're doing a tail call, use a TC_RETURN here rather than an
3987 // actual call instruction.
3988 if (IsTailCall) {
3989 MFI.setHasTailCall();
3990 unsigned OPC = AMDGPUISD::TC_RETURN;
3991 switch (CallConv) {
3992 case CallingConv::AMDGPU_Gfx:
3993 OPC = AMDGPUISD::TC_RETURN_GFX;
3994 break;
3995 case CallingConv::AMDGPU_CS_Chain:
3996 case CallingConv::AMDGPU_CS_ChainPreserve:
3997 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3998 break;
3999 }
4000
4001 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4002 }
4003
4004 // Returns a chain and a flag for retval copy to use.
4005 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4006 Chain = Call.getValue(0);
4007 InGlue = Call.getValue(1);
4008
4009 uint64_t CalleePopBytes = NumBytes;
4010 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4011 if (!Ins.empty())
4012 InGlue = Chain.getValue(1);
4013
4014 // Handle result values, copying them out of physregs into vregs that we
4015 // return.
4016 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4017 InVals, /*IsThisReturn=*/false, SDValue());
4018}
4019
4020// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4021// except for:
4022// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4023// 2. Scaled size, where scaled-size = wave-reduction(alloca-size) * wave-size
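// For example, on a wave64 target a per-lane request of Size bytes advances the
// stack pointer by wave_reduce_umax(Size) << 6, i.e. the largest request in the
// wave replicated for all 64 lanes; a constant Size needs no reduction. The old
// (aligned) SP value is what gets returned as the allocation's address.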
4025 SelectionDAG &DAG) const {
4026 const MachineFunction &MF = DAG.getMachineFunction();
4027 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4028
4029 SDLoc dl(Op);
4030 EVT VT = Op.getValueType();
4031 SDValue Chain = Op.getOperand(0);
4032 Register SPReg = Info->getStackPtrOffsetReg();
4033
4034 // Chain the dynamic stack allocation so that it doesn't modify the stack
4035 // pointer when other instructions are using the stack.
4036 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4037
4038 SDValue Size = Op.getOperand(1);
4039 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4040 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4041
4042 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4043 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4044 "Stack grows upwards for AMDGPU");
4045
4046 Chain = BaseAddr.getValue(1);
4047 Align StackAlign = TFL->getStackAlign();
4048 if (Alignment > StackAlign) {
4049 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4050 << Subtarget->getWavefrontSizeLog2();
4051 uint64_t StackAlignMask = ScaledAlignment - 1;
4052 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4053 DAG.getConstant(StackAlignMask, dl, VT));
4054 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4055 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4056 }
4057
4058 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4059 SDValue NewSP;
4060 if (isa<ConstantSDNode>(Size)) {
4061 // For a constant-sized alloca, scale the alloca size by the wave size.
4062 SDValue ScaledSize = DAG.getNode(
4063 ISD::SHL, dl, VT, Size,
4064 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4065 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4066 } else {
4067 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4068 // max of the (divergent) alloca size and then scale it by the wave size.
4069 SDValue WaveReduction =
4070 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4071 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4072 Size, DAG.getConstant(0, dl, MVT::i32));
4073 SDValue ScaledSize = DAG.getNode(
4074 ISD::SHL, dl, VT, Size,
4075 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4076 NewSP =
4077 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4078 SDValue ReadFirstLaneID =
4079 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4080 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4081 NewSP);
4082 }
4083
4084 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4085 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4086
4087 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4088}
4089
4091 if (Op.getValueType() != MVT::i32)
4092 return Op; // Defer to cannot select error.
4093
4095 SDLoc SL(Op);
4096
4097 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4098
4099 // Convert from wave uniform to swizzled vector address. This should protect
4100 // from any edge cases where the stacksave result isn't directly used with
4101 // stackrestore.
4102 SDValue VectorAddress =
4103 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4104 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4105}
4106
4108 SelectionDAG &DAG) const {
4109 SDLoc SL(Op);
4110 assert(Op.getValueType() == MVT::i32);
4111
4112 uint32_t BothRoundHwReg =
4114 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4115
4116 SDValue IntrinID =
4117 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4118 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4119 Op.getOperand(0), IntrinID, GetRoundBothImm);
4120
4121 // There are two rounding modes, one for f32 and one for f64/f16. We only
4122 // report in the standard value range if both are the same.
4123 //
4124 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4125 // ties away from zero is not supported, and the other values are rotated by
4126 // 1.
4127 //
4128 // If the two rounding modes are not the same, report a target defined value.
4129
4130 // Mode register rounding mode fields:
4131 //
4132 // [1:0] Single-precision round mode.
4133 // [3:2] Double/Half-precision round mode.
4134 //
4135 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4136 //
4137 //                 Hardware   Spec
4138 //  Toward-0           3        0
4139 //  Nearest Even       0        1
4140 //  +Inf               1        2
4141 //  -Inf               2        3
4142 //  NearestAway0      N/A       4
4143 //
4144 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4145 // table we can index by the raw hardware mode.
4146 //
4147 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
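// For example, if both fields hold the same standard mode the extracted 4-bit
// entry is already one of the FLT_ROUNDS values 0-3 and is returned as is; if
// the two fields disagree, the entry is >= 4 and is biased by 4 below, so mixed
// modes are reported as target-defined values starting at 8.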
4148
4149 SDValue BitTable =
4150 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4151
4152 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4153 SDValue RoundModeTimesNumBits =
4154 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4155
4156 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4157 // knew only one mode was demanded.
4158 SDValue TableValue =
4159 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4160 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4161
4162 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4163 SDValue TableEntry =
4164 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4165
4166 // There's a gap between the 4-bit encoded table values and the actual enum
4167 // values, so offset the result if it's an extended value.
4168 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4169 SDValue IsStandardValue =
4170 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4171 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4172 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4173 TableEntry, EnumOffset);
4174
4175 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4176}
4177
4179 SelectionDAG &DAG) const {
4180 SDLoc SL(Op);
4181
4182 SDValue NewMode = Op.getOperand(1);
4183 assert(NewMode.getValueType() == MVT::i32);
4184
4185 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4186 // hardware MODE.fp_round values.
4187 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4188 uint32_t ClampedVal = std::min(
4189 static_cast<uint32_t>(ConstMode->getZExtValue()),
4191 NewMode = DAG.getConstant(
4192 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4193 } else {
4194 // If we know the input can only be one of the supported standard modes in
4195 // the range 0-3, we can use a simplified mapping to hardware values.
4196 KnownBits KB = DAG.computeKnownBits(NewMode);
4197 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4198 // The supported standard values are 0-3. The extended values start at 8. We
4199 // need to offset by 4 if the value is in the extended range.
4200
4201 if (UseReducedTable) {
4202 // Use only the low 16 bits of the table (the entries for the standard values 0-3).
4203 SDValue BitTable = DAG.getConstant(
4204 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4205
4206 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4207 SDValue RoundModeTimesNumBits =
4208 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4209
4210 NewMode =
4211 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4212
4213 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4214 // the table extracted bits into inline immediates.
4215 } else {
4216 // table_index = umin(value, value - 4)
4217 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
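// e.g. a standard value such as 2 gives umin(2, 2 - 4) = 2 because the
// subtraction wraps to a huge unsigned number, while an extended value such as
// 9 gives umin(9, 5) = 5, so the standard inputs 0-3 use table slots 0-3 and
// the extended inputs (starting at 8) use slots 4 and up.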
4218 SDValue BitTable =
4219 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4220
4221 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4222 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4223 SDValue IndexVal =
4224 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4225
4226 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4227 SDValue RoundModeTimesNumBits =
4228 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4229
4230 SDValue TableValue =
4231 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4232 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4233
4234 // No need to mask out the high bits since the setreg will ignore them
4235 // anyway.
4236 NewMode = TruncTable;
4237 }
4238
4239 // Insert a readfirstlane in case the value is a VGPR. We could do this
4240 // earlier and keep more operations scalar, but that interferes with
4241 // combining the source.
4242 SDValue ReadFirstLaneID =
4243 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4244 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4245 ReadFirstLaneID, NewMode);
4246 }
4247
4248 // N.B. The setreg will be later folded into s_round_mode on supported
4249 // targets.
4250 SDValue IntrinID =
4251 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4252 uint32_t BothRoundHwReg =
4254 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4255
4256 SDValue SetReg =
4257 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4258 IntrinID, RoundBothImm, NewMode);
4259
4260 return SetReg;
4261}
4262
4264 if (Op->isDivergent())
4265 return SDValue();
4266
4267 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4272 break;
4273 default:
4274 return SDValue();
4275 }
4276
4277 return Op;
4278}
4279
4280// Work around DAG legality rules that are based only on the result type.
4282 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4283 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4284 EVT SrcVT = Src.getValueType();
4285
4286 if (SrcVT.getScalarType() != MVT::bf16)
4287 return Op;
4288
4289 SDLoc SL(Op);
4290 SDValue BitCast =
4291 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4292
4293 EVT DstVT = Op.getValueType();
4294 if (IsStrict)
4295 llvm_unreachable("Need STRICT_BF16_TO_FP");
4296
4297 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4298}
4299
4301 SDLoc SL(Op);
4302 if (Op.getValueType() != MVT::i64)
4303 return Op;
4304
4305 uint32_t ModeHwReg =
4307 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4308 uint32_t TrapHwReg =
4310 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4311
4312 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4313 SDValue IntrinID =
4314 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4315 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4316 Op.getOperand(0), IntrinID, ModeHwRegImm);
4317 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4318 Op.getOperand(0), IntrinID, TrapHwRegImm);
4319 SDValue TokenReg =
4320 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4321 GetTrapReg.getValue(1));
4322
4323 SDValue CvtPtr =
4324 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4325 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4326
4327 return DAG.getMergeValues({Result, TokenReg}, SL);
4328}
4329
4331 SDLoc SL(Op);
4332 if (Op.getOperand(1).getValueType() != MVT::i64)
4333 return Op;
4334
4335 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4336 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4337 DAG.getConstant(0, SL, MVT::i32));
4338 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4339 DAG.getConstant(1, SL, MVT::i32));
4340
4341 SDValue ReadFirstLaneID =
4342 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4343 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4344 ReadFirstLaneID, NewModeReg);
4345 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4346 ReadFirstLaneID, NewTrapReg);
4347
4348 unsigned ModeHwReg =
4350 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4351 unsigned TrapHwReg =
4353 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4354
4355 SDValue IntrinID =
4356 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4357 SDValue SetModeReg =
4358 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4359 IntrinID, ModeHwRegImm, NewModeReg);
4360 SDValue SetTrapReg =
4361 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4362 IntrinID, TrapHwRegImm, NewTrapReg);
4363 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4364}
4365
4367 const MachineFunction &MF) const {
4369 .Case("m0", AMDGPU::M0)
4370 .Case("exec", AMDGPU::EXEC)
4371 .Case("exec_lo", AMDGPU::EXEC_LO)
4372 .Case("exec_hi", AMDGPU::EXEC_HI)
4373 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4374 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4375 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4376 .Default(Register());
4377
4378 if (Reg == AMDGPU::NoRegister) {
4380 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4381 }
4382
4383 if (!Subtarget->hasFlatScrRegister() &&
4384 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4385 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4386 "\" for subtarget."));
4387 }
4388
4389 switch (Reg) {
4390 case AMDGPU::M0:
4391 case AMDGPU::EXEC_LO:
4392 case AMDGPU::EXEC_HI:
4393 case AMDGPU::FLAT_SCR_LO:
4394 case AMDGPU::FLAT_SCR_HI:
4395 if (VT.getSizeInBits() == 32)
4396 return Reg;
4397 break;
4398 case AMDGPU::EXEC:
4399 case AMDGPU::FLAT_SCR:
4400 if (VT.getSizeInBits() == 64)
4401 return Reg;
4402 break;
4403 default:
4404 llvm_unreachable("missing register type checking");
4405 }
4406
4408 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4409}
4410
4411// If kill is not the last instruction, split the block so kill is always a
4412// proper terminator.
4415 MachineBasicBlock *BB) const {
4416 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4418 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4419 return SplitBB;
4420}
4421
4422// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4423// \p MI will be the only instruction in the loop body block. Otherwise, it will
4424// be the first instruction in the remainder block.
4425//
4426/// \returns { LoopBody, Remainder }
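// The resulting CFG is MBB -> LoopBB -> { LoopBB, RemainderBB }; the caller is
// expected to add the terminators that actually take the back edge.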
4427static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4431
4432 // To insert the loop we need to split the block. Move everything after this
4433 // point to a new block, and insert a new empty block between the two.
4435 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4437 ++MBBI;
4438
4439 MF->insert(MBBI, LoopBB);
4440 MF->insert(MBBI, RemainderBB);
4441
4442 LoopBB->addSuccessor(LoopBB);
4443 LoopBB->addSuccessor(RemainderBB);
4444
4445 // Move the rest of the block into a new block.
4446 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4447
4448 if (InstInLoop) {
4449 auto Next = std::next(I);
4450
4451 // Move instruction to loop body.
4452 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4453
4454 // Move the rest of the block.
4455 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4456 } else {
4457 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4458 }
4459
4460 MBB.addSuccessor(LoopBB);
4461
4462 return std::pair(LoopBB, RemainderBB);
4463}
4464
4465/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4467 MachineBasicBlock *MBB = MI.getParent();
4469 auto I = MI.getIterator();
4470 auto E = std::next(I);
4471
4472 // clang-format off
4473 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4474 .addImm(0);
4475 // clang-format on
4476
4477 MIBundleBuilder Bundler(*MBB, I, E);
4478 finalizeBundle(*MBB, Bundler.begin());
4479}
4480
4483 MachineBasicBlock *BB) const {
4484 const DebugLoc &DL = MI.getDebugLoc();
4485
4487
4489
4490 // Apparently kill flags are only valid if the def is in the same block?
4491 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4492 Src->setIsKill(false);
4493
4494 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4495
4496 MachineBasicBlock::iterator I = LoopBB->end();
4497
4498 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4500
4501 // Clear TRAP_STS.MEM_VIOL
4502 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4503 .addImm(0)
4504 .addImm(EncodedReg);
4505
4507
4508 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4509
4510 // Load and check TRAP_STS.MEM_VIOL
4511 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4512 .addImm(EncodedReg);
4513
4514 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4515 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4516 .addReg(Reg, RegState::Kill)
4517 .addImm(0);
4518 // clang-format off
4519 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4520 .addMBB(LoopBB);
4521 // clang-format on
4522
4523 return RemainderBB;
4524}
4525
4526// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4527// wavefront. If the value is uniform and just happens to be in a VGPR, this
4528// will only do one iteration. In the worst case, this will loop 64 times.
4529//
4530// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
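// For example, if \p IdxReg holds only two distinct values across the wave,
// EXEC gets partitioned into two groups of lanes and the loop body runs twice,
// once per unique index.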
4533 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4534 const DebugLoc &DL, const MachineOperand &Idx,
4535 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4536 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4537 Register &SGPRIdxReg) {
4538
4539 MachineFunction *MF = OrigBB.getParent();
4540 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4541 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4543
4544 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4545 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4546 Register NewExec = MRI.createVirtualRegister(BoolRC);
4547 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4548 Register CondReg = MRI.createVirtualRegister(BoolRC);
4549
4550 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4551 .addReg(InitReg)
4552 .addMBB(&OrigBB)
4553 .addReg(ResultReg)
4554 .addMBB(&LoopBB);
4555
4556 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4557 .addReg(InitSaveExecReg)
4558 .addMBB(&OrigBB)
4559 .addReg(NewExec)
4560 .addMBB(&LoopBB);
4561
4562 // Read the next variant <- also loop target.
4563 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4564 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4565
4566 // Compare the just read M0 value to all possible Idx values.
4567 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4568 .addReg(CurrentIdxReg)
4569 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4570
4571 // Update EXEC, save the original EXEC value to VCC.
4572 BuildMI(LoopBB, I, DL,
4573 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4574 : AMDGPU::S_AND_SAVEEXEC_B64),
4575 NewExec)
4576 .addReg(CondReg, RegState::Kill);
4577
4578 MRI.setSimpleHint(NewExec, CondReg);
4579
4580 if (UseGPRIdxMode) {
4581 if (Offset == 0) {
4582 SGPRIdxReg = CurrentIdxReg;
4583 } else {
4584 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4585 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4586 .addReg(CurrentIdxReg, RegState::Kill)
4587 .addImm(Offset);
4588 }
4589 } else {
4590 // Move index from VCC into M0
4591 if (Offset == 0) {
4592 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4593 .addReg(CurrentIdxReg, RegState::Kill);
4594 } else {
4595 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4596 .addReg(CurrentIdxReg, RegState::Kill)
4597 .addImm(Offset);
4598 }
4599 }
4600
4601 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4602 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4603 MachineInstr *InsertPt =
4604 BuildMI(LoopBB, I, DL,
4605 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4606 : AMDGPU::S_XOR_B64_term),
4607 Exec)
4608 .addReg(Exec)
4609 .addReg(NewExec);
4610
4611 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4612 // s_cbranch_scc0?
4613
4614 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4615 // clang-format off
4616 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4617 .addMBB(&LoopBB);
4618 // clang-format on
4619
4620 return InsertPt->getIterator();
4621}
4622
4623// This has slightly sub-optimal regalloc when the source vector is killed by
4624// the read. The register allocator does not understand that the kill is
4625// per-workitem, so the source is kept alive for the whole loop and we end up
4626// not re-using a subregister from it, using 1 more VGPR than necessary. This
4627// extra register was avoided when this was expanded after register allocation.
4630 unsigned InitResultReg, unsigned PhiReg, int Offset,
4631 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4633 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4634 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4636 const DebugLoc &DL = MI.getDebugLoc();
4638
4639 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4640 Register DstReg = MI.getOperand(0).getReg();
4641 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4642 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4643 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4644 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4645
4646 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4647
4648 // Save the EXEC mask
4649 // clang-format off
4650 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4651 .addReg(Exec);
4652 // clang-format on
4653
4654 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4655
4656 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4657
4658 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4659 InitResultReg, DstReg, PhiReg, TmpExec,
4660 Offset, UseGPRIdxMode, SGPRIdxReg);
4661
4662 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4664 ++MBBI;
4665 MF->insert(MBBI, LandingPad);
4666 LoopBB->removeSuccessor(RemainderBB);
4667 LandingPad->addSuccessor(RemainderBB);
4668 LoopBB->addSuccessor(LandingPad);
4669 MachineBasicBlock::iterator First = LandingPad->begin();
4670 // clang-format off
4671 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4672 .addReg(SaveExec);
4673 // clang-format on
4674
4675 return InsPt;
4676}
4677
4678// Returns subreg index, offset
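// e.g. for a 128-bit (4 x 32-bit) register class an Offset of 2 yields
// { sub2, 0 }, while an out-of-bounds Offset is returned unchanged with sub0 so
// the caller still operates on a defined subregister.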
4679static std::pair<unsigned, int>
4681 const TargetRegisterClass *SuperRC, unsigned VecReg,
4682 int Offset) {
4683 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4684
4685 // Skip out of bounds offsets, or else we would end up using an undefined
4686 // register.
4687 if (Offset >= NumElts || Offset < 0)
4688 return std::pair(AMDGPU::sub0, Offset);
4689
4690 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4691}
4692
4695 int Offset) {
4696 MachineBasicBlock *MBB = MI.getParent();
4697 const DebugLoc &DL = MI.getDebugLoc();
4699
4700 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4701
4702 assert(Idx->getReg() != AMDGPU::NoRegister);
4703
4704 if (Offset == 0) {
4705 // clang-format off
4706 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4707 .add(*Idx);
4708 // clang-format on
4709 } else {
4710 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4711 .add(*Idx)
4712 .addImm(Offset);
4713 }
4714}
4715
4718 int Offset) {
4719 MachineBasicBlock *MBB = MI.getParent();
4720 const DebugLoc &DL = MI.getDebugLoc();
4722
4723 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4724
4725 if (Offset == 0)
4726 return Idx->getReg();
4727
4728 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4729 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4730 .add(*Idx)
4731 .addImm(Offset);
4732 return Tmp;
4733}
4734
4737 const GCNSubtarget &ST) {
4738 const SIInstrInfo *TII = ST.getInstrInfo();
4739 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4742
4743 Register Dst = MI.getOperand(0).getReg();
4744 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4745 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4746 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4747
4748 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4749 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4750
4751 unsigned SubReg;
4752 std::tie(SubReg, Offset) =
4753 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4754
4755 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4756
4757 // Check for a SGPR index.
4758 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4760 const DebugLoc &DL = MI.getDebugLoc();
4761
4762 if (UseGPRIdxMode) {
4763 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4764 // to avoid interfering with other uses, so probably requires a new
4765 // optimization pass.
4767
4768 const MCInstrDesc &GPRIDXDesc =
4769 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4770 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4771 .addReg(SrcReg)
4772 .addReg(Idx)
4773 .addImm(SubReg);
4774 } else {
4776
4777 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4778 .addReg(SrcReg, 0, SubReg)
4779 .addReg(SrcReg, RegState::Implicit);
4780 }
4781
4782 MI.eraseFromParent();
4783
4784 return &MBB;
4785 }
4786
4787 // Control flow needs to be inserted if indexing with a VGPR.
4788 const DebugLoc &DL = MI.getDebugLoc();
4790
4791 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4792 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4793
4794 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4795
4796 Register SGPRIdxReg;
4797 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4798 UseGPRIdxMode, SGPRIdxReg);
4799
4800 MachineBasicBlock *LoopBB = InsPt->getParent();
4801
4802 if (UseGPRIdxMode) {
4803 const MCInstrDesc &GPRIDXDesc =
4804 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4805
4806 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4807 .addReg(SrcReg)
4808 .addReg(SGPRIdxReg)
4809 .addImm(SubReg);
4810 } else {
4811 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4812 .addReg(SrcReg, 0, SubReg)
4813 .addReg(SrcReg, RegState::Implicit);
4814 }
4815
4816 MI.eraseFromParent();
4817
4818 return LoopBB;
4819}
4820
4823 const GCNSubtarget &ST) {
4824 const SIInstrInfo *TII = ST.getInstrInfo();
4825 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4828
4829 Register Dst = MI.getOperand(0).getReg();
4830 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4831 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4832 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4833 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4834 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4835 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4836
4837 // This can be an immediate, but will be folded later.
4838 assert(Val->getReg());
4839
4840 unsigned SubReg;
4841 std::tie(SubReg, Offset) =
4842 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4843 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4844
4845 if (Idx->getReg() == AMDGPU::NoRegister) {
4847 const DebugLoc &DL = MI.getDebugLoc();
4848
4849 assert(Offset == 0);
4850
4851 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4852 .add(*SrcVec)
4853 .add(*Val)
4854 .addImm(SubReg);
4855
4856 MI.eraseFromParent();
4857 return &MBB;
4858 }
4859
4860 // Check for a SGPR index.
4861 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4863 const DebugLoc &DL = MI.getDebugLoc();
4864
4865 if (UseGPRIdxMode) {
4867
4868 const MCInstrDesc &GPRIDXDesc =
4869 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4870 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4871 .addReg(SrcVec->getReg())
4872 .add(*Val)
4873 .addReg(Idx)
4874 .addImm(SubReg);
4875 } else {
4877
4878 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4879 TRI.getRegSizeInBits(*VecRC), 32, false);
4880 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4881 .addReg(SrcVec->getReg())
4882 .add(*Val)
4883 .addImm(SubReg);
4884 }
4885 MI.eraseFromParent();
4886 return &MBB;
4887 }
4888
4889 // Control flow needs to be inserted if indexing with a VGPR.
4890 if (Val->isReg())
4891 MRI.clearKillFlags(Val->getReg());
4892
4893 const DebugLoc &DL = MI.getDebugLoc();
4894
4895 Register PhiReg = MRI.createVirtualRegister(VecRC);
4896
4897 Register SGPRIdxReg;
4898 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4899 UseGPRIdxMode, SGPRIdxReg);
4900 MachineBasicBlock *LoopBB = InsPt->getParent();
4901
4902 if (UseGPRIdxMode) {
4903 const MCInstrDesc &GPRIDXDesc =
4904 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4905
4906 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4907 .addReg(PhiReg)
4908 .add(*Val)
4909 .addReg(SGPRIdxReg)
4910 .addImm(SubReg);
4911 } else {
4912 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4913 TRI.getRegSizeInBits(*VecRC), 32, false);
4914 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4915 .addReg(PhiReg)
4916 .add(*Val)
4917 .addImm(SubReg);
4918 }
4919
4920 MI.eraseFromParent();
4921 return LoopBB;
4922}
4923
4926 const GCNSubtarget &ST,
4927 unsigned Opc) {
4929 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4930 const DebugLoc &DL = MI.getDebugLoc();
4931 const SIInstrInfo *TII = ST.getInstrInfo();
4932
4933 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4934 Register SrcReg = MI.getOperand(1).getReg();
4935 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4936 Register DstReg = MI.getOperand(0).getReg();
4937 MachineBasicBlock *RetBB = nullptr;
4938 if (isSGPR) {
4939 // These operations are idempotent for a uniform value, i.e. an SGPR input.
4940 // The reduced value will be the same as the given SGPR.
4941 // clang-format off
4942 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4943 .addReg(SrcReg);
4944 // clang-format on
4945 RetBB = &BB;
4946 } else {
4947 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4948 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4949 // iterative approach by default.
4950
4951 // To reduce the VGPR using the iterative approach, we need to iterate
4952 // over all the active lanes. The lowering consists of a ComputeLoop,
4953 // which iterates over only the active lanes. We use a copy of the EXEC
4954 // register as the induction variable, and each iteration clears the
4955 // processed lane's bit with bitset0 to obtain the next active lane.
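// Roughly: Accumulator = (Opc == S_MIN_U32) ? UINT32_MAX : 0;
//          ActiveBits = EXEC;
//          do {
//            Lane = ff1(ActiveBits);
//            Accumulator = Opc(Accumulator, readlane(Src, Lane));
//            ActiveBits = bitset0(ActiveBits, Lane);
//          } while (ActiveBits != 0);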
4956 MachineBasicBlock::iterator I = BB.end();
4957 Register SrcReg = MI.getOperand(1).getReg();
4958
4959 // Create control flow for the loop:
4960 // split MI's machine basic block into the loop body and the remainder.
4961 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4962
4963 // Create virtual registers required for lowering.
4964 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4965 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4966 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4967 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4968
4969 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4970 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4971 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4972
4973 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4974 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4975
4976 bool IsWave32 = ST.isWave32();
4977 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4978 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4979
4980 // Create the initial values of the induction variable (from EXEC) and the
4981 // accumulator, and insert a branch to the newly created ComputeLoop block.
4982 uint32_t InitalValue =
4983 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4984 auto TmpSReg =
4985 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4986 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4987 .addImm(InitalValue);
4988 // clang-format off
4989 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
4990 .addMBB(ComputeLoop);
4991 // clang-format on
4992
4993 // Start constructing ComputeLoop
4994 I = ComputeLoop->end();
4995 auto Accumulator =
4996 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4997 .addReg(InitalValReg)
4998 .addMBB(&BB);
4999 auto ActiveBits =
5000 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5001 .addReg(TmpSReg->getOperand(0).getReg())
5002 .addMBB(&BB);
5003
5004 // Perform the computations
5005 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5006 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5007 .addReg(ActiveBits->getOperand(0).getReg());
5008 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5009 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5010 .addReg(SrcReg)
5011 .addReg(FF1->getOperand(0).getReg());
5012 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5013 .addReg(Accumulator->getOperand(0).getReg())
5014 .addReg(LaneValue->getOperand(0).getReg());
5015
5016 // Manipulate the iterator to get the next active lane
5017 unsigned BITSETOpc =
5018 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5019 auto NewActiveBits =
5020 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5021 .addReg(FF1->getOperand(0).getReg())
5022 .addReg(ActiveBits->getOperand(0).getReg());
5023
5024 // Complete the PHI nodes with the loop back-edge values.
5025 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5026 .addMBB(ComputeLoop);
5027 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5028 .addMBB(ComputeLoop);
5029
5030 // Create the loop branch.
5031 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5032 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5033 .addReg(NewActiveBits->getOperand(0).getReg())
5034 .addImm(0);
5035 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5036 .addMBB(ComputeLoop);
5037
5038 RetBB = ComputeEnd;
5039 }
5040 MI.eraseFromParent();
5041 return RetBB;
5042}
5043
5046 MachineBasicBlock *BB) const {
5047
5049 MachineFunction *MF = BB->getParent();
5051
5052 switch (MI.getOpcode()) {
5053 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5054 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5055 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5056 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5057 case AMDGPU::S_UADDO_PSEUDO:
5058 case AMDGPU::S_USUBO_PSEUDO: {
5059 const DebugLoc &DL = MI.getDebugLoc();
5060 MachineOperand &Dest0 = MI.getOperand(0);
5061 MachineOperand &Dest1 = MI.getOperand(1);
5062 MachineOperand &Src0 = MI.getOperand(2);
5063 MachineOperand &Src1 = MI.getOperand(3);
5064
5065 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5066 ? AMDGPU::S_ADD_I32
5067 : AMDGPU::S_SUB_I32;
5068 // clang-format off
5069 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5070 .add(Src0)
5071 .add(Src1);
5072 // clang-format on
5073
5074 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5075 .addImm(1)
5076 .addImm(0);
5077
5078 MI.eraseFromParent();
5079 return BB;
5080 }
5081 case AMDGPU::S_ADD_U64_PSEUDO:
5082 case AMDGPU::S_SUB_U64_PSEUDO: {
5083 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5084 // For GFX12, we emit s_add_u64 and s_sub_u64.
5085 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5087 const DebugLoc &DL = MI.getDebugLoc();
5088 MachineOperand &Dest = MI.getOperand(0);
5089 MachineOperand &Src0 = MI.getOperand(1);
5090 MachineOperand &Src1 = MI.getOperand(2);
5091 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5092 if (Subtarget->hasScalarAddSub64()) {
5093 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5094 // clang-format off
5095 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5096 .add(Src0)
5097 .add(Src1);
5098 // clang-format on
5099 } else {
5100 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5101 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5102
5103 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5104 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5105
5106 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5107 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5108 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5109 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5110
5111 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5112 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5113 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5114 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5115
5116 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5117 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5118 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5119 .add(Src0Sub0)
5120 .add(Src1Sub0);
5121 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5122 .add(Src0Sub1)
5123 .add(Src1Sub1);
5124 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5125 .addReg(DestSub0)
5126 .addImm(AMDGPU::sub0)
5127 .addReg(DestSub1)
5128 .addImm(AMDGPU::sub1);
5129 }
5130 MI.eraseFromParent();
5131 return BB;
5132 }
5133 case AMDGPU::V_ADD_U64_PSEUDO:
5134 case AMDGPU::V_SUB_U64_PSEUDO: {
5136 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5137 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5138 const DebugLoc &DL = MI.getDebugLoc();
5139
5140 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5141
5142 MachineOperand &Dest = MI.getOperand(0);
5143 MachineOperand &Src0 = MI.getOperand(1);
5144 MachineOperand &Src1 = MI.getOperand(2);
5145
5146 if (IsAdd && ST.hasLshlAddB64()) {
5147 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5148 Dest.getReg())
5149 .add(Src0)
5150 .addImm(0)
5151 .add(Src1);
5152 TII->legalizeOperands(*Add);
5153 MI.eraseFromParent();
5154 return BB;
5155 }
5156
5157 const auto *CarryRC = TRI->getWaveMaskRegClass();
5158
5159 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5160 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5161
5162 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5163 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5164
5165 const TargetRegisterClass *Src0RC = Src0.isReg()
5166 ? MRI.getRegClass(Src0.getReg())
5167 : &AMDGPU::VReg_64RegClass;
5168 const TargetRegisterClass *Src1RC = Src1.isReg()
5169 ? MRI.getRegClass(Src1.getReg())
5170 : &AMDGPU::VReg_64RegClass;
5171
5172 const TargetRegisterClass *Src0SubRC =
5173 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5174 const TargetRegisterClass *Src1SubRC =
5175 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5176
5177 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5178 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5179 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5180 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5181
5182 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5183 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5184 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5185 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5186
5187 unsigned LoOpc =
5188 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5189 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5190 .addReg(CarryReg, RegState::Define)
5191 .add(SrcReg0Sub0)
5192 .add(SrcReg1Sub0)
5193 .addImm(0); // clamp bit
5194
5195 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5196 MachineInstr *HiHalf =
5197 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5198 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5199 .add(SrcReg0Sub1)
5200 .add(SrcReg1Sub1)
5201 .addReg(CarryReg, RegState::Kill)
5202 .addImm(0); // clamp bit
5203
5204 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5205 .addReg(DestSub0)
5206 .addImm(AMDGPU::sub0)
5207 .addReg(DestSub1)
5208 .addImm(AMDGPU::sub1);
5209 TII->legalizeOperands(*LoHalf);
5210 TII->legalizeOperands(*HiHalf);
5211 MI.eraseFromParent();
5212 return BB;
5213 }
5214 case AMDGPU::S_ADD_CO_PSEUDO:
5215 case AMDGPU::S_SUB_CO_PSEUDO: {
5216 // This pseudo can only be selected from a uniform
5217 // add/subcarry node. All the VGPR operands are
5218 // therefore assumed to be splat vectors.
5220 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5223 const DebugLoc &DL = MI.getDebugLoc();
5224 MachineOperand &Dest = MI.getOperand(0);
5225 MachineOperand &CarryDest = MI.getOperand(1);
5226 MachineOperand &Src0 = MI.getOperand(2);
5227 MachineOperand &Src1 = MI.getOperand(3);
5228 MachineOperand &Src2 = MI.getOperand(4);
5229 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5230 ? AMDGPU::S_ADDC_U32
5231 : AMDGPU::S_SUBB_U32;
5232 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5233 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5234 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5235 .addReg(Src0.getReg());
5236 Src0.setReg(RegOp0);
5237 }
5238 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5239 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5241 .addReg(Src1.getReg());
5242 Src1.setReg(RegOp1);
5243 }
5244 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5245 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5246 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5247 .addReg(Src2.getReg());
5248 Src2.setReg(RegOp2);
5249 }
5250
5251 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5252 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5253 assert(WaveSize == 64 || WaveSize == 32);
5254
5255 if (WaveSize == 64) {
5256 if (ST.hasScalarCompareEq64()) {
5257 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5258 .addReg(Src2.getReg())
5259 .addImm(0);
5260 } else {
5261 const TargetRegisterClass *SubRC =
5262 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5263 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5264 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5265 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5266 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5267 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5268
5269 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5270 .add(Src2Sub0)
5271 .add(Src2Sub1);
5272
5273 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5274 .addReg(Src2_32, RegState::Kill)
5275 .addImm(0);
5276 }
5277 } else {
5278 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5279 .addReg(Src2.getReg())
5280 .addImm(0);
5281 }
5282
5283 // clang-format off
5284 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5285 .add(Src0)
5286 .add(Src1);
5287 // clang-format on
5288
5289 unsigned SelOpc =
5290 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5291
5292 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5293 .addImm(-1)
5294 .addImm(0);
5295
5296 MI.eraseFromParent();
5297 return BB;
5298 }
5299 case AMDGPU::SI_INIT_M0: {
5300 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5301 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5302 .add(MI.getOperand(0));
5303 MI.eraseFromParent();
5304 return BB;
5305 }
5306 case AMDGPU::GET_GROUPSTATICSIZE: {
5307 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5308 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5309 DebugLoc DL = MI.getDebugLoc();
5310 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5311 .add(MI.getOperand(0))
5312 .addImm(MFI->getLDSSize());
5313 MI.eraseFromParent();
5314 return BB;
5315 }
5316 case AMDGPU::GET_SHADERCYCLESHILO: {
5319 const DebugLoc &DL = MI.getDebugLoc();
5320 // The algorithm is:
5321 //
5322 // hi1 = getreg(SHADER_CYCLES_HI)
5323 // lo1 = getreg(SHADER_CYCLES_LO)
5324 // hi2 = getreg(SHADER_CYCLES_HI)
5325 //
5326 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5327 // Otherwise there was overflow and the result is hi2:0. In both cases the
5328 // result should represent the actual time at some point during the sequence
5329 // of three getregs.
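    // Equivalent C-style sketch of the sequence emitted below (illustrative
    // only; the read_shader_cycles_* helpers are hypothetical):
    //
    //   uint32_t hi1 = read_shader_cycles_hi();
    //   uint32_t lo1 = read_shader_cycles_lo();
    //   uint32_t hi2 = read_shader_cycles_hi();
    //   uint32_t lo  = (hi1 == hi2) ? lo1 : 0;      // s_cselect on SCC
    //   uint64_t res = ((uint64_t)hi2 << 32) | lo;  // REG_SEQUENCE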
5330 using namespace AMDGPU::Hwreg;
5331 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5332 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5333 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5334 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5335 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5336 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5337 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5339 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5340 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5341 .addReg(RegHi1)
5342 .addReg(RegHi2);
5343 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5344 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5345 .addReg(RegLo1)
5346 .addImm(0);
5347 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5348 .add(MI.getOperand(0))
5349 .addReg(RegLo)
5350 .addImm(AMDGPU::sub0)
5351 .addReg(RegHi2)
5352 .addImm(AMDGPU::sub1);
5353 MI.eraseFromParent();
5354 return BB;
5355 }
5356 case AMDGPU::SI_INDIRECT_SRC_V1:
5357 case AMDGPU::SI_INDIRECT_SRC_V2:
5358 case AMDGPU::SI_INDIRECT_SRC_V4:
5359 case AMDGPU::SI_INDIRECT_SRC_V8:
5360 case AMDGPU::SI_INDIRECT_SRC_V9:
5361 case AMDGPU::SI_INDIRECT_SRC_V10:
5362 case AMDGPU::SI_INDIRECT_SRC_V11:
5363 case AMDGPU::SI_INDIRECT_SRC_V12:
5364 case AMDGPU::SI_INDIRECT_SRC_V16:
5365 case AMDGPU::SI_INDIRECT_SRC_V32:
5366 return emitIndirectSrc(MI, *BB, *getSubtarget());
5367 case AMDGPU::SI_INDIRECT_DST_V1:
5368 case AMDGPU::SI_INDIRECT_DST_V2:
5369 case AMDGPU::SI_INDIRECT_DST_V4:
5370 case AMDGPU::SI_INDIRECT_DST_V8:
5371 case AMDGPU::SI_INDIRECT_DST_V9:
5372 case AMDGPU::SI_INDIRECT_DST_V10:
5373 case AMDGPU::SI_INDIRECT_DST_V11:
5374 case AMDGPU::SI_INDIRECT_DST_V12:
5375 case AMDGPU::SI_INDIRECT_DST_V16:
5376 case AMDGPU::SI_INDIRECT_DST_V32:
5377 return emitIndirectDst(MI, *BB, *getSubtarget());
5378 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5379 case AMDGPU::SI_KILL_I1_PSEUDO:
5380 return splitKillBlock(MI, BB);
5381 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5383 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5384 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5385
5386 Register Dst = MI.getOperand(0).getReg();
5387 const MachineOperand &Src0 = MI.getOperand(1);
5388 const MachineOperand &Src1 = MI.getOperand(2);
5389 const DebugLoc &DL = MI.getDebugLoc();
5390 Register SrcCond = MI.getOperand(3).getReg();
5391
5392 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5393 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5394 const auto *CondRC = TRI->getWaveMaskRegClass();
5395 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5396
5397 const TargetRegisterClass *Src0RC = Src0.isReg()
5398 ? MRI.getRegClass(Src0.getReg())
5399 : &AMDGPU::VReg_64RegClass;
5400 const TargetRegisterClass *Src1RC = Src1.isReg()
5401 ? MRI.getRegClass(Src1.getReg())
5402 : &AMDGPU::VReg_64RegClass;
5403
5404 const TargetRegisterClass *Src0SubRC =
5405 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5406 const TargetRegisterClass *Src1SubRC =
5407 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5408
5409 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5410 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5411 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5412 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5413
5414 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5415 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5416 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5417 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5418
5419 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5420 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5421 .addImm(0)
5422 .add(Src0Sub0)
5423 .addImm(0)
5424 .add(Src1Sub0)
5425 .addReg(SrcCondCopy);
5426 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5427 .addImm(0)
5428 .add(Src0Sub1)
5429 .addImm(0)
5430 .add(Src1Sub1)
5431 .addReg(SrcCondCopy);
5432
5433 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5434 .addReg(DstLo)
5435 .addImm(AMDGPU::sub0)
5436 .addReg(DstHi)
5437 .addImm(AMDGPU::sub1);
5438 MI.eraseFromParent();
5439 return BB;
5440 }
5441 case AMDGPU::SI_BR_UNDEF: {
5443 const DebugLoc &DL = MI.getDebugLoc();
5444 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5445 .add(MI.getOperand(0));
5446 Br->getOperand(1).setIsUndef(); // read undef SCC
5447 MI.eraseFromParent();
5448 return BB;
5449 }
5450 case AMDGPU::ADJCALLSTACKUP:
5451 case AMDGPU::ADJCALLSTACKDOWN: {
5453 MachineInstrBuilder MIB(*MF, &MI);
5454 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5455 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5456 return BB;
5457 }
5458 case AMDGPU::SI_CALL_ISEL: {
5460 const DebugLoc &DL = MI.getDebugLoc();
5461
5462 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5463
5465 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5466
5467 for (const MachineOperand &MO : MI.operands())
5468 MIB.add(MO);
5469
5470 MIB.cloneMemRefs(MI);
5471 MI.eraseFromParent();
5472 return BB;
5473 }
5474 case AMDGPU::V_ADD_CO_U32_e32:
5475 case AMDGPU::V_SUB_CO_U32_e32:
5476 case AMDGPU::V_SUBREV_CO_U32_e32: {
5477 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5478 const DebugLoc &DL = MI.getDebugLoc();
5479 unsigned Opc = MI.getOpcode();
5480
5481 bool NeedClampOperand = false;
5482 if (TII->pseudoToMCOpcode(Opc) == -1) {
5483 Opc = AMDGPU::getVOPe64(Opc);
5484 NeedClampOperand = true;
5485 }
5486
5487 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5488 if (TII->isVOP3(*I)) {
5489 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5490 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5491 I.addReg(TRI->getVCC(), RegState::Define);
5492 }
5493 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5494 if (NeedClampOperand)
5495 I.addImm(0); // clamp bit for e64 encoding
5496
5497 TII->legalizeOperands(*I);
5498
5499 MI.eraseFromParent();
5500 return BB;
5501 }
5502 case AMDGPU::V_ADDC_U32_e32:
5503 case AMDGPU::V_SUBB_U32_e32:
5504 case AMDGPU::V_SUBBREV_U32_e32:
5505 // These instructions have an implicit use of vcc which counts towards the
5506 // constant bus limit.
5507 TII->legalizeOperands(MI);
5508 return BB;
5509 case AMDGPU::DS_GWS_INIT:
5510 case AMDGPU::DS_GWS_SEMA_BR:
5511 case AMDGPU::DS_GWS_BARRIER:
5512 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5513 [[fallthrough]];
5514 case AMDGPU::DS_GWS_SEMA_V:
5515 case AMDGPU::DS_GWS_SEMA_P:
5516 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5517    // An s_waitcnt 0 is required to be the instruction immediately following.
5518 if (getSubtarget()->hasGWSAutoReplay()) {
5520 return BB;
5521 }
5522
5523 return emitGWSMemViolTestLoop(MI, BB);
5524 case AMDGPU::S_SETREG_B32: {
5525 // Try to optimize cases that only set the denormal mode or rounding mode.
5526 //
5527 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5528 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5529 // instead.
5530 //
5531    // FIXME: This could be predicated on the immediate, but tablegen doesn't
5532    // allow a no-side-effect instruction in the output of a
5533    // side-effecting pattern.
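    // For example (illustrative): a write of hwreg(HW_REG_MODE, 0, 4) from an
    // immediate covers exactly the FP round bits and can become
    // s_round_mode imm, while a write of hwreg(HW_REG_MODE, 4, 4) covers
    // exactly the FP denorm bits and can become s_denorm_mode imm.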
5534 auto [ID, Offset, Width] =
5535 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5536    if (ID != AMDGPU::Hwreg::ID_MODE)
5537      return BB;
5538
5539 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5540 const unsigned SetMask = WidthMask << Offset;
5541
5542 if (getSubtarget()->hasDenormModeInst()) {
5543 unsigned SetDenormOp = 0;
5544 unsigned SetRoundOp = 0;
5545
5546 // The dedicated instructions can only set the whole denorm or round mode
5547 // at once, not a subset of bits in either.
5548      if (SetMask == (AMDGPU::Hwreg::FP_ROUND_MASK |
5549                      AMDGPU::Hwreg::FP_DENORM_MASK)) {
5550 // If this fully sets both the round and denorm mode, emit the two
5551 // dedicated instructions for these.
5552 SetRoundOp = AMDGPU::S_ROUND_MODE;
5553 SetDenormOp = AMDGPU::S_DENORM_MODE;
5554 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5555 SetRoundOp = AMDGPU::S_ROUND_MODE;
5556 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5557 SetDenormOp = AMDGPU::S_DENORM_MODE;
5558 }
5559
5560 if (SetRoundOp || SetDenormOp) {
5562 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5563 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5564 unsigned ImmVal = Def->getOperand(1).getImm();
5565 if (SetRoundOp) {
5566 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5567 .addImm(ImmVal & 0xf);
5568
5569 // If we also have the denorm mode, get just the denorm mode bits.
5570 ImmVal >>= 4;
5571 }
5572
5573 if (SetDenormOp) {
5574 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5575 .addImm(ImmVal & 0xf);
5576 }
5577
5578 MI.eraseFromParent();
5579 return BB;
5580 }
5581 }
5582 }
5583
5584    // If only FP bits are touched, use the no-side-effects pseudo.
5585 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5586 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5587 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5588
5589 return BB;
5590 }
5591 case AMDGPU::S_INVERSE_BALLOT_U32:
5592 case AMDGPU::S_INVERSE_BALLOT_U64:
5593 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5594 // necessary. After that they are equivalent to a COPY.
5595 MI.setDesc(TII->get(AMDGPU::COPY));
5596 return BB;
5597 case AMDGPU::ENDPGM_TRAP: {
5598 const DebugLoc &DL = MI.getDebugLoc();
5599 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5600 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5601 MI.addOperand(MachineOperand::CreateImm(0));
5602 return BB;
5603 }
5604
5605 // We need a block split to make the real endpgm a terminator. We also don't
5606 // want to break phis in successor blocks, so we can't just delete to the
5607 // end of the block.
5608
5609 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5611 MF->push_back(TrapBB);
5612 // clang-format off
5613 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5614 .addImm(0);
5615 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5616 .addMBB(TrapBB);
5617 // clang-format on
5618
5619 BB->addSuccessor(TrapBB);
5620 MI.eraseFromParent();
5621 return SplitBB;
5622 }
5623 case AMDGPU::SIMULATED_TRAP: {
5624 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5626 MachineBasicBlock *SplitBB =
5627 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5628 MI.eraseFromParent();
5629 return SplitBB;
5630 }
5631 default:
5632 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5633 if (!MI.mayStore())
5635 return BB;
5636 }
5638 }
5639}
5640
5642 // This currently forces unfolding various combinations of fsub into fma with
5643 // free fneg'd operands. As long as we have fast FMA (controlled by
5644 // isFMAFasterThanFMulAndFAdd), we should perform these.
5645
5646 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5647 // most of these combines appear to be cycle neutral but save on instruction
5648 // count / code size.
5649 return true;
5650}
5651
5653
5655 EVT VT) const {
5656 if (!VT.isVector()) {
5657 return MVT::i1;
5658 }
5659 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5660}
5661
5663 // TODO: Should i16 be used always if legal? For now it would force VALU
5664 // shifts.
5665 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5666}
5667
5669 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5670 ? Ty.changeElementSize(16)
5671 : Ty.changeElementSize(32);
5672}
5673
5674// Answering this is somewhat tricky and depends on the specific device, since
5675// devices have different rates for fma and for f64 operations in general.
5676//
5677// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5678// regardless of which device (although the number of cycles differs between
5679// devices), so it is always profitable for f64.
5680//
5681// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5682// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5683// which we can always do even without fused FP ops since it returns the same
5684// result as the separate operations and since it is always full
5685// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5686// however does not support denormals, so we do report fma as faster if we have
5687// a fast fma device and require denormals.
5688//
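// Informal summary of the decision implemented below:
//   f64            -> always report fma as faster.
//   f32, no mad    -> faster iff the device has full-rate fma (hasFastFMAF32).
//   f32, mad avail -> if f32 denormals are required, report fma faster when the
//                     device has fast FMA or DL insts; otherwise only when it
//                     has both.
//   f16            -> faster iff 16-bit insts exist and f16 denormals are
//                     enabled (not flushed).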
5690 EVT VT) const {
5691 VT = VT.getScalarType();
5692
5693 switch (VT.getSimpleVT().SimpleTy) {
5694 case MVT::f32: {
5695 // If mad is not available this depends only on if f32 fma is full rate.
5696 if (!Subtarget->hasMadMacF32Insts())
5697 return Subtarget->hasFastFMAF32();
5698
5699 // Otherwise f32 mad is always full rate and returns the same result as
5700 // the separate operations so should be preferred over fma.
5701    // However, mad does not support denormals.
5703 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5704
5705 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5706 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5707 }
5708 case MVT::f64:
5709 return true;
5710 case MVT::f16:
5711 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5712 default:
5713 break;
5714 }
5715
5716 return false;
5717}
5718
5720 LLT Ty) const {
5721 switch (Ty.getScalarSizeInBits()) {
5722 case 16:
5723 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5724 case 32:
5725 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5726 case 64:
5727 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5728 default:
5729 break;
5730 }
5731
5732 return false;
5733}
5734
5735// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5736// specific details.
5738 Type *Ty) const {
5739 switch (Ty->getScalarSizeInBits()) {
5740 case 16: {
5742 return Subtarget->has16BitInsts() &&
5743 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5744 }
5745 case 32: {
5746 if (!Subtarget->hasMadMacF32Insts())
5747 return Subtarget->hasFastFMAF32();
5748
5750 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5751 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5752
5753 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5754 }
5755 case 64:
5756 return true;
5757 default:
5758 break;
5759 }
5760
5761 return false;
5762}
5763
5765 if (!Ty.isScalar())
5766 return false;
5767
5768 if (Ty.getScalarSizeInBits() == 16)
5769 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5770 if (Ty.getScalarSizeInBits() == 32)
5771 return Subtarget->hasMadMacF32Insts() &&
5772 denormalModeIsFlushAllF32(*MI.getMF());
5773
5774 return false;
5775}
5776
5778 const SDNode *N) const {
5779 // TODO: Check future ftz flag
5780 // v_mad_f32/v_mac_f32 do not support denormals.
5781 EVT VT = N->getValueType(0);
5782 if (VT == MVT::f32)
5783 return Subtarget->hasMadMacF32Insts() &&
5785 if (VT == MVT::f16) {
5786 return Subtarget->hasMadF16() &&
5788 }
5789
5790 return false;
5791}
5792
5793//===----------------------------------------------------------------------===//
5794// Custom DAG Lowering Operations
5795//===----------------------------------------------------------------------===//
5796
5797// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5798// wider vector type is legal.
5800 SelectionDAG &DAG) const {
5801 unsigned Opc = Op.getOpcode();
5802 EVT VT = Op.getValueType();
5803 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5804 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5805 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5806 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5807
5808 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5809
5810 SDLoc SL(Op);
5811 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5812 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5813
5814 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5815}
5816
5817// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5818// wider vector type is legal.
5820 SelectionDAG &DAG) const {
5821 unsigned Opc = Op.getOpcode();
5822 EVT VT = Op.getValueType();
5823 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5824 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5825 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5826 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5827
5828 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5829 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5830
5831 SDLoc SL(Op);
5832
5833 SDValue OpLo =
5834 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5835 SDValue OpHi =
5836 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5837
5838 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5839}
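
// For example (illustrative): a v4f16 fadd is split here into two v2f16 fadds
// on the low and high halves and recombined with CONCAT_VECTORS, instead of
// letting LegalizeDAG scalarize it into four f16 operations.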
5840
5842 SelectionDAG &DAG) const {
5843 unsigned Opc = Op.getOpcode();
5844 EVT VT = Op.getValueType();
5845 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5846 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5847 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5848 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5849 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5850 VT == MVT::v32bf16);
5851
5852 SDValue Op0 = Op.getOperand(0);
5853 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5854 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5855 : std::pair(Op0, Op0);
5856
5857 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5858 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5859
5860 SDLoc SL(Op);
5861 auto ResVT = DAG.GetSplitDestVTs(VT);
5862
5863 SDValue OpLo =
5864 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5865 SDValue OpHi =
5866 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5867
5868 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5869}
5870
5872 switch (Op.getOpcode()) {
5873 default:
5875 case ISD::BRCOND:
5876 return LowerBRCOND(Op, DAG);
5877 case ISD::RETURNADDR:
5878 return LowerRETURNADDR(Op, DAG);
5879 case ISD::LOAD: {
5880 SDValue Result = LowerLOAD(Op, DAG);
5881 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5882 "Load should return a value and a chain");
5883 return Result;
5884 }
5885 case ISD::FSQRT: {
5886 EVT VT = Op.getValueType();
5887 if (VT == MVT::f32)
5888 return lowerFSQRTF32(Op, DAG);
5889 if (VT == MVT::f64)
5890 return lowerFSQRTF64(Op, DAG);
5891 return SDValue();
5892 }
5893 case ISD::FSIN:
5894 case ISD::FCOS:
5895 return LowerTrig(Op, DAG);
5896 case ISD::SELECT:
5897 return LowerSELECT(Op, DAG);
5898 case ISD::FDIV:
5899 return LowerFDIV(Op, DAG);
5900 case ISD::FFREXP:
5901 return LowerFFREXP(Op, DAG);
5903 return LowerATOMIC_CMP_SWAP(Op, DAG);
5904 case ISD::STORE:
5905 return LowerSTORE(Op, DAG);
5906 case ISD::GlobalAddress: {
5909 return LowerGlobalAddress(MFI, Op, DAG);
5910 }
5912 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5914 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5916 return LowerINTRINSIC_VOID(Op, DAG);
5917 case ISD::ADDRSPACECAST:
5918 return lowerADDRSPACECAST(Op, DAG);
5920 return lowerINSERT_SUBVECTOR(Op, DAG);
5922 return lowerINSERT_VECTOR_ELT(Op, DAG);
5924 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5926 return lowerVECTOR_SHUFFLE(Op, DAG);
5928 return lowerSCALAR_TO_VECTOR(Op, DAG);
5929 case ISD::BUILD_VECTOR:
5930 return lowerBUILD_VECTOR(Op, DAG);
5931 case ISD::FP_ROUND:
5933 return lowerFP_ROUND(Op, DAG);
5934 case ISD::TRAP:
5935 return lowerTRAP(Op, DAG);
5936 case ISD::DEBUGTRAP:
5937 return lowerDEBUGTRAP(Op, DAG);
5938 case ISD::ABS:
5939 case ISD::FABS:
5940 case ISD::FNEG:
5941 case ISD::FCANONICALIZE:
5942 case ISD::BSWAP:
5943 return splitUnaryVectorOp(Op, DAG);
5944 case ISD::FMINNUM:
5945 case ISD::FMAXNUM:
5946 return lowerFMINNUM_FMAXNUM(Op, DAG);
5947 case ISD::FLDEXP:
5948 case ISD::STRICT_FLDEXP:
5949 return lowerFLDEXP(Op, DAG);
5950 case ISD::FMA:
5951 return splitTernaryVectorOp(Op, DAG);
5952 case ISD::FP_TO_SINT:
5953 case ISD::FP_TO_UINT:
5954 return LowerFP_TO_INT(Op, DAG);
5955 case ISD::SHL:
5956 case ISD::SRA:
5957 case ISD::SRL:
5958 case ISD::ADD:
5959 case ISD::SUB:
5960 case ISD::SMIN:
5961 case ISD::SMAX:
5962 case ISD::UMIN:
5963 case ISD::UMAX:
5964 case ISD::FADD:
5965 case ISD::FMUL:
5966 case ISD::FMINNUM_IEEE:
5967 case ISD::FMAXNUM_IEEE:
5968 case ISD::FMINIMUM:
5969 case ISD::FMAXIMUM:
5970 case ISD::FMINIMUMNUM:
5971 case ISD::FMAXIMUMNUM:
5972 case ISD::UADDSAT:
5973 case ISD::USUBSAT:
5974 case ISD::SADDSAT:
5975 case ISD::SSUBSAT:
5976 return splitBinaryVectorOp(Op, DAG);
5977 case ISD::MUL:
5978 return lowerMUL(Op, DAG);
5979 case ISD::SMULO:
5980 case ISD::UMULO:
5981 return lowerXMULO(Op, DAG);
5982 case ISD::SMUL_LOHI:
5983 case ISD::UMUL_LOHI:
5984 return lowerXMUL_LOHI(Op, DAG);
5986 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5987 case ISD::STACKSAVE:
5988 return LowerSTACKSAVE(Op, DAG);
5989 case ISD::GET_ROUNDING:
5990 return lowerGET_ROUNDING(Op, DAG);
5991 case ISD::SET_ROUNDING:
5992 return lowerSET_ROUNDING(Op, DAG);
5993 case ISD::PREFETCH:
5994 return lowerPREFETCH(Op, DAG);
5995 case ISD::FP_EXTEND:
5997 return lowerFP_EXTEND(Op, DAG);
5998 case ISD::GET_FPENV:
5999 return lowerGET_FPENV(Op, DAG);
6000 case ISD::SET_FPENV:
6001 return lowerSET_FPENV(Op, DAG);
6002 }
6003 return SDValue();
6004}
6005
6006// Used for D16: casts the result of an instruction into the right vector type,
6007// packing the values if loads return unpacked values.
6009 const SDLoc &DL, SelectionDAG &DAG,
6010 bool Unpacked) {
6011 if (!LoadVT.isVector())
6012 return Result;
6013
6014 // Cast back to the original packed type or to a larger type that is a
6015  // multiple of 32 bits for D16. Widening the return type is required for
6016 // legalization.
6017 EVT FittingLoadVT = LoadVT;
6018 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6019 FittingLoadVT =
6021 LoadVT.getVectorNumElements() + 1);
6022 }
6023
6024 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6025 // Truncate to v2i16/v4i16.
6026 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6027
6028    // Work around the legalizer not scalarizing the truncate after vector op
6029    // legalization while also not creating an intermediate vector trunc.
6031 DAG.ExtractVectorElements(Result, Elts);
6032 for (SDValue &Elt : Elts)
6033 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6034
6035    // Pad illegal v1i16/v3f16 to v4i16
6036 if ((LoadVT.getVectorNumElements() % 2) == 1)
6037 Elts.push_back(DAG.getUNDEF(MVT::i16));
6038
6039 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6040
6041 // Bitcast to original type (v2f16/v4f16).
6042 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6043 }
6044
6045 // Cast back to the original packed type.
6046 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6047}
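
// Worked example (illustrative): on a subtarget with unpacked D16 memory ops,
// a v3f16 load comes back as three 32-bit results. Each element is truncated
// to i16, an undef element is appended to pad the illegal v3i16 to v4i16, and
// the result is bitcast back to v4f16 for legalization.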
6048
6049SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6050 SelectionDAG &DAG,
6052 bool IsIntrinsic) const {
6053 SDLoc DL(M);
6054
6055 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6056 EVT LoadVT = M->getValueType(0);
6057
6058 EVT EquivLoadVT = LoadVT;
6059 if (LoadVT.isVector()) {
6060 if (Unpacked) {
6061 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6062 LoadVT.getVectorNumElements());
6063 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6064 // Widen v3f16 to legal type
6065 EquivLoadVT =
6067 LoadVT.getVectorNumElements() + 1);
6068 }
6069 }
6070
6071 // Change from v4f16/v2f16 to EquivLoadVT.
6072 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6073
6075 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6076 M->getMemoryVT(), M->getMemOperand());
6077
6078 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6079
6080 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6081}
6082
6083SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6084 SelectionDAG &DAG,
6085 ArrayRef<SDValue> Ops) const {
6086 SDLoc DL(M);
6087 EVT LoadVT = M->getValueType(0);
6088 EVT EltType = LoadVT.getScalarType();
6089 EVT IntVT = LoadVT.changeTypeToInteger();
6090
6091 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6092
6093 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6094 bool IsTFE = M->getNumValues() == 3;
6095
6096 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6100
6101 if (IsD16) {
6102 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6103 }
6104
6105 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6106 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6107 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6108 IsTFE);
6109
6110 if (isTypeLegal(LoadVT)) {
6111 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6112 M->getMemOperand(), DAG);
6113 }
6114
6115 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6116 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6117 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6118 M->getMemOperand(), DAG);
6119 return DAG.getMergeValues(
6120 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6121 DL);
6122}
6123
6125 SelectionDAG &DAG) {
6126 EVT VT = N->getValueType(0);
6127 unsigned CondCode = N->getConstantOperandVal(3);
6128 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6129 return DAG.getUNDEF(VT);
6130
6131 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6132
6133 SDValue LHS = N->getOperand(1);
6134 SDValue RHS = N->getOperand(2);
6135
6136 SDLoc DL(N);
6137
6138 EVT CmpVT = LHS.getValueType();
6139 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6140 unsigned PromoteOp =
6142 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6143 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6144 }
6145
6146 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6147
6148 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6149 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6150
6151 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6152 DAG.getCondCode(CCOpcode));
6153 if (VT.bitsEq(CCVT))
6154 return SetCC;
6155 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6156}
6157
6159 SelectionDAG &DAG) {
6160 EVT VT = N->getValueType(0);
6161
6162 unsigned CondCode = N->getConstantOperandVal(3);
6163 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6164 return DAG.getUNDEF(VT);
6165
6166 SDValue Src0 = N->getOperand(1);
6167 SDValue Src1 = N->getOperand(2);
6168 EVT CmpVT = Src0.getValueType();
6169 SDLoc SL(N);
6170
6171 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6172 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6173 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6174 }
6175
6176 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6177 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6178 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6179 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6180 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6181 DAG.getCondCode(CCOpcode));
6182 if (VT.bitsEq(CCVT))
6183 return SetCC;
6184 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6185}
6186
6188 SelectionDAG &DAG) {
6189 EVT VT = N->getValueType(0);
6190 SDValue Src = N->getOperand(1);
6191 SDLoc SL(N);
6192
6193 if (Src.getOpcode() == ISD::SETCC) {
6194 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6195 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6196 Src.getOperand(1), Src.getOperand(2));
6197 }
6198 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6199 // (ballot 0) -> 0
6200 if (Arg->isZero())
6201 return DAG.getConstant(0, SL, VT);
6202
6203 // (ballot 1) -> EXEC/EXEC_LO
6204 if (Arg->isOne()) {
6205 Register Exec;
6206 if (VT.getScalarSizeInBits() == 32)
6207 Exec = AMDGPU::EXEC_LO;
6208 else if (VT.getScalarSizeInBits() == 64)
6209 Exec = AMDGPU::EXEC;
6210 else
6211 return SDValue();
6212
6213 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6214 }
6215 }
6216
6217 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6218 // ISD::SETNE)
6219 return DAG.getNode(
6220 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6221 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6222}
6223
6225 SelectionDAG &DAG) {
6226 EVT VT = N->getValueType(0);
6227 unsigned ValSize = VT.getSizeInBits();
6228 unsigned IID = N->getConstantOperandVal(0);
6229 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6230 IID == Intrinsic::amdgcn_permlanex16;
6231 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6232 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6233 SDLoc SL(N);
6234 MVT IntVT = MVT::getIntegerVT(ValSize);
6235 const GCNSubtarget *ST = TLI.getSubtarget();
6236 unsigned SplitSize = 32;
6237 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6238 ST->hasDPALU_DPP() &&
6239 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6240 SplitSize = 64;
6241
6242 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6243 SDValue Src2, MVT ValT) -> SDValue {
6245 switch (IID) {
6246 case Intrinsic::amdgcn_permlane16:
6247 case Intrinsic::amdgcn_permlanex16:
6248 case Intrinsic::amdgcn_update_dpp:
6249 Operands.push_back(N->getOperand(6));
6250 Operands.push_back(N->getOperand(5));
6251 Operands.push_back(N->getOperand(4));
6252 [[fallthrough]];
6253 case Intrinsic::amdgcn_writelane:
6254 Operands.push_back(Src2);
6255 [[fallthrough]];
6256 case Intrinsic::amdgcn_readlane:
6257 case Intrinsic::amdgcn_set_inactive:
6258 case Intrinsic::amdgcn_set_inactive_chain_arg:
6259 case Intrinsic::amdgcn_mov_dpp8:
6260 Operands.push_back(Src1);
6261 [[fallthrough]];
6262 case Intrinsic::amdgcn_readfirstlane:
6263 case Intrinsic::amdgcn_permlane64:
6264 Operands.push_back(Src0);
6265 break;
6266 default:
6267 llvm_unreachable("unhandled lane op");
6268 }
6269
6270 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6271 std::reverse(Operands.begin(), Operands.end());
6272
6273 if (SDNode *GL = N->getGluedNode()) {
6274 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6275 GL = GL->getOperand(0).getNode();
6276 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6277 SDValue(GL, 0)));
6278 }
6279
6280 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6281 };
6282
6283 SDValue Src0 = N->getOperand(1);
6284 SDValue Src1, Src2;
6285 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6286 IID == Intrinsic::amdgcn_mov_dpp8 ||
6287 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6288 Src1 = N->getOperand(2);
6289 if (IID == Intrinsic::amdgcn_writelane ||
6290 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6291 Src2 = N->getOperand(3);
6292 }
6293
6294 if (ValSize == SplitSize) {
6295 // Already legal
6296 return SDValue();
6297 }
6298
6299 if (ValSize < 32) {
6300 bool IsFloat = VT.isFloatingPoint();
6301 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6302 SL, MVT::i32);
6303
6304 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6305 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6306 SL, MVT::i32);
6307 }
6308
6309 if (IID == Intrinsic::amdgcn_writelane) {
6310 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6311 SL, MVT::i32);
6312 }
6313
6314 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6315 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6316 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6317 }
6318
6319 if (ValSize % SplitSize != 0)
6320 return SDValue();
6321
6322 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6323 EVT VT = N->getValueType(0);
6324 unsigned NE = VT.getVectorNumElements();
6325 EVT EltVT = VT.getVectorElementType();
6327 unsigned NumOperands = N->getNumOperands();
6328 SmallVector<SDValue, 4> Operands(NumOperands);
6329 SDNode *GL = N->getGluedNode();
6330
6331 // only handle convergencectrl_glue
6333
6334 for (unsigned i = 0; i != NE; ++i) {
6335 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6336 ++j) {
6337 SDValue Operand = N->getOperand(j);
6338 EVT OperandVT = Operand.getValueType();
6339 if (OperandVT.isVector()) {
6340 // A vector operand; extract a single element.
6341 EVT OperandEltVT = OperandVT.getVectorElementType();
6342 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6343 Operand, DAG.getVectorIdxConstant(i, SL));
6344 } else {
6345 // A scalar operand; just use it as is.
6346 Operands[j] = Operand;
6347 }
6348 }
6349
6350 if (GL)
6351 Operands[NumOperands - 1] =
6352 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6353 SDValue(GL->getOperand(0).getNode(), 0));
6354
6355 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6356 }
6357
6358 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6359 return DAG.getBuildVector(VecVT, SL, Scalars);
6360 };
6361
6362 if (VT.isVector()) {
6363 switch (MVT::SimpleValueType EltTy =
6365 case MVT::i32:
6366 case MVT::f32:
6367 if (SplitSize == 32) {
6368 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6369 return unrollLaneOp(LaneOp.getNode());
6370 }
6371 [[fallthrough]];
6372 case MVT::i16:
6373 case MVT::f16:
6374 case MVT::bf16: {
6375 unsigned SubVecNumElt =
6376 SplitSize / VT.getVectorElementType().getSizeInBits();
6377 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6379 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6380 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6381 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6382 DAG.getConstant(EltIdx, SL, MVT::i32));
6383
6384 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6385 IsPermLane16)
6386 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6387 DAG.getConstant(EltIdx, SL, MVT::i32));
6388
6389 if (IID == Intrinsic::amdgcn_writelane)
6390 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6391 DAG.getConstant(EltIdx, SL, MVT::i32));
6392
6393 Pieces.push_back(
6394 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6395 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6396 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6397 EltIdx += SubVecNumElt;
6398 }
6399 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6400 }
6401 default:
6402 // Handle all other cases by bitcasting to i32 vectors
6403 break;
6404 }
6405 }
6406
6407 MVT VecVT =
6408 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6409 Src0 = DAG.getBitcast(VecVT, Src0);
6410
6411 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6412 Src1 = DAG.getBitcast(VecVT, Src1);
6413
6414 if (IID == Intrinsic::amdgcn_writelane)
6415 Src2 = DAG.getBitcast(VecVT, Src2);
6416
6417 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6418 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6419 return DAG.getBitcast(VT, UnrolledLaneOp);
6420}
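
// Example of the splitting above (illustrative): a 64-bit readlane is bitcast
// to v2i32, the intrinsic is unrolled into two 32-bit readlanes (one per
// element, re-gluing any convergence token), and the two results are rebuilt
// into a vector and bitcast back to the original 64-bit type.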
6421
6424 SelectionDAG &DAG) const {
6425 switch (N->getOpcode()) {
6427 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6428 Results.push_back(Res);
6429 return;
6430 }
6432 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6433 Results.push_back(Res);
6434 return;
6435 }
6437 unsigned IID = N->getConstantOperandVal(0);
6438 switch (IID) {
6439 case Intrinsic::amdgcn_make_buffer_rsrc:
6440 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6441 return;
6442 case Intrinsic::amdgcn_cvt_pkrtz: {
6443 SDValue Src0 = N->getOperand(1);
6444 SDValue Src1 = N->getOperand(2);
6445 SDLoc SL(N);
6446 SDValue Cvt =
6447 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6448 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6449 return;
6450 }
6451 case Intrinsic::amdgcn_cvt_pknorm_i16:
6452 case Intrinsic::amdgcn_cvt_pknorm_u16:
6453 case Intrinsic::amdgcn_cvt_pk_i16:
6454 case Intrinsic::amdgcn_cvt_pk_u16: {
6455 SDValue Src0 = N->getOperand(1);
6456 SDValue Src1 = N->getOperand(2);
6457 SDLoc SL(N);
6458 unsigned Opcode;
6459
6460 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6462 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6464 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6466 else
6468
6469 EVT VT = N->getValueType(0);
6470 if (isTypeLegal(VT))
6471 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6472 else {
6473 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6474 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6475 }
6476 return;
6477 }
6478 case Intrinsic::amdgcn_s_buffer_load: {
6479 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6480 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6481 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6482 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6483 // s_buffer_load_i8.
6484 if (!Subtarget->hasScalarSubwordLoads())
6485 return;
6486 SDValue Op = SDValue(N, 0);
6487 SDValue Rsrc = Op.getOperand(1);
6488 SDValue Offset = Op.getOperand(2);
6489 SDValue CachePolicy = Op.getOperand(3);
6490 EVT VT = Op.getValueType();
6491 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6492 SDLoc DL(Op);
6494 const DataLayout &DataLayout = DAG.getDataLayout();
6495 Align Alignment =
6501 VT.getStoreSize(), Alignment);
6502 SDValue LoadVal;
6503 if (!Offset->isDivergent()) {
6504 SDValue Ops[] = {Rsrc, // source register
6505 Offset, CachePolicy};
6506 SDValue BufferLoad =
6508 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6509 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6510 } else {
6511 SDValue Ops[] = {
6512 DAG.getEntryNode(), // Chain
6513 Rsrc, // rsrc
6514 DAG.getConstant(0, DL, MVT::i32), // vindex
6515 {}, // voffset
6516 {}, // soffset
6517 {}, // offset
6518 CachePolicy, // cachepolicy
6519 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6520 };
6521 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6522 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6523 }
6524 Results.push_back(LoadVal);
6525 return;
6526 }
6527 }
6528 break;
6529 }
6531 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6532 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6533 // FIXME: Hacky
6534 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6535 Results.push_back(Res.getOperand(I));
6536 }
6537 } else {
6538 Results.push_back(Res);
6539 Results.push_back(Res.getValue(1));
6540 }
6541 return;
6542 }
6543
6544 break;
6545 }
6546 case ISD::SELECT: {
6547 SDLoc SL(N);
6548 EVT VT = N->getValueType(0);
6549 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6550 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6551 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6552
6553 EVT SelectVT = NewVT;
6554 if (NewVT.bitsLT(MVT::i32)) {
6555 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6556 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6557 SelectVT = MVT::i32;
6558 }
6559
6560 SDValue NewSelect =
6561 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6562
6563 if (NewVT != SelectVT)
6564 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6565 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6566 return;
6567 }
6568 case ISD::FNEG: {
6569 if (N->getValueType(0) != MVT::v2f16)
6570 break;
6571
6572 SDLoc SL(N);
6573 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6574
6575 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6576 DAG.getConstant(0x80008000, SL, MVT::i32));
6577 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6578 return;
6579 }
6580 case ISD::FABS: {
6581 if (N->getValueType(0) != MVT::v2f16)
6582 break;
6583
6584 SDLoc SL(N);
6585 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6586
6587 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6588 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6589 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6590 return;
6591 }
6592 case ISD::FSQRT: {
6593 if (N->getValueType(0) != MVT::f16)
6594 break;
6595 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6596 break;
6597 }
6598 default:
6600 break;
6601 }
6602}
6603
6604/// Helper function for LowerBRCOND
6605static SDNode *findUser(SDValue Value, unsigned Opcode) {
6606
6607 for (SDUse &U : Value->uses()) {
6608 if (U.get() != Value)
6609 continue;
6610
6611 if (U.getUser()->getOpcode() == Opcode)
6612 return U.getUser();
6613 }
6614 return nullptr;
6615}
6616
6617unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6618 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6619 switch (Intr->getConstantOperandVal(1)) {
6620 case Intrinsic::amdgcn_if:
6621 return AMDGPUISD::IF;
6622 case Intrinsic::amdgcn_else:
6623 return AMDGPUISD::ELSE;
6624 case Intrinsic::amdgcn_loop:
6625 return AMDGPUISD::LOOP;
6626 case Intrinsic::amdgcn_end_cf:
6627 llvm_unreachable("should not occur");
6628 default:
6629 return 0;
6630 }
6631 }
6632
6633 // break, if_break, else_break are all only used as inputs to loop, not
6634 // directly as branch conditions.
6635 return 0;
6636}
6637
6639 const Triple &TT = getTargetMachine().getTargetTriple();
6643}
6644
6646 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6647 return false;
6648
6649 // FIXME: Either avoid relying on address space here or change the default
6650 // address space for functions to avoid the explicit check.
6651 return (GV->getValueType()->isFunctionTy() ||
6654}
6655
6657 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6658}
6659
6661 if (!GV->hasExternalLinkage())
6662 return true;
6663
6664 const auto OS = getTargetMachine().getTargetTriple().getOS();
6665 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6666}
6667
6668/// This transforms the control flow intrinsics so that the branch destination
6669/// becomes the last parameter; it also switches the branch target with BR if needed.
6670SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6671 SDLoc DL(BRCOND);
6672
6673 SDNode *Intr = BRCOND.getOperand(1).getNode();
6674 SDValue Target = BRCOND.getOperand(2);
6675 SDNode *BR = nullptr;
6676 SDNode *SetCC = nullptr;
6677
6678 if (Intr->getOpcode() == ISD::SETCC) {
6679 // As long as we negate the condition everything is fine
6680 SetCC = Intr;
6681 Intr = SetCC->getOperand(0).getNode();
6682
6683 } else {
6684 // Get the target from BR if we don't negate the condition
6685 BR = findUser(BRCOND, ISD::BR);
6686 assert(BR && "brcond missing unconditional branch user");
6687 Target = BR->getOperand(1);
6688 }
6689
6690 unsigned CFNode = isCFIntrinsic(Intr);
6691 if (CFNode == 0) {
6692 // This is a uniform branch so we don't need to legalize.
6693 return BRCOND;
6694 }
6695
6696 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6697 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6698
6699 assert(!SetCC ||
6700 (SetCC->getConstantOperandVal(1) == 1 &&
6701 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6702 ISD::SETNE));
6703
6704 // operands of the new intrinsic call
6706 if (HaveChain)
6707 Ops.push_back(BRCOND.getOperand(0));
6708
6709 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6710 Ops.push_back(Target);
6711
6712 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6713
6714 // build the new intrinsic call
6715 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6716
6717 if (!HaveChain) {
6718 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6719
6720 Result = DAG.getMergeValues(Ops, DL).getNode();
6721 }
6722
6723 if (BR) {
6724 // Give the branch instruction our target
6725 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6726 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6727 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6728 }
6729
6730 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6731
6732 // Copy the intrinsic results to registers
6733 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6735 if (!CopyToReg)
6736 continue;
6737
6738 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6739 SDValue(Result, i - 1), SDValue());
6740
6741 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6742 }
6743
6744 // Remove the old intrinsic from the chain
6745 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6746 Intr->getOperand(0));
6747
6748 return Chain;
6749}
6750
6751SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6752 MVT VT = Op.getSimpleValueType();
6753 SDLoc DL(Op);
6754 // Checking the depth
6755 if (Op.getConstantOperandVal(0) != 0)
6756 return DAG.getConstant(0, DL, VT);
6757
6760 // Check for kernel and shader functions
6761 if (Info->isEntryFunction())
6762 return DAG.getConstant(0, DL, VT);
6763
6764 MachineFrameInfo &MFI = MF.getFrameInfo();
6765 // There is a call to @llvm.returnaddress in this function
6766 MFI.setReturnAddressIsTaken(true);
6767
6769 // Get the return address reg and mark it as an implicit live-in
6770 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6771 getRegClassFor(VT, Op.getNode()->isDivergent()));
6772
6773 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6774}
6775
6776SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6777 const SDLoc &DL, EVT VT) const {
6778 return Op.getValueType().bitsLE(VT)
6779 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6780 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6781 DAG.getTargetConstant(0, DL, MVT::i32));
6782}
6783
6784SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6785 assert(Op.getValueType() == MVT::f16 &&
6786 "Do not know how to custom lower FP_ROUND for non-f16 type");
6787
6788 SDValue Src = Op.getOperand(0);
6789 EVT SrcVT = Src.getValueType();
6790 if (SrcVT != MVT::f64)
6791 return Op;
6792
6793 // TODO: Handle strictfp
6794 if (Op.getOpcode() != ISD::FP_ROUND)
6795 return Op;
6796
6797 SDLoc DL(Op);
6798
6799 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6800 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6801 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6802}
6803
6804SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6805 SelectionDAG &DAG) const {
6806 EVT VT = Op.getValueType();
6807 const MachineFunction &MF = DAG.getMachineFunction();
6809 bool IsIEEEMode = Info->getMode().IEEE;
6810
6811 // FIXME: Assert during selection that this is only selected for
6812 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6813 // mode functions, but this happens to be OK since it's only done in cases
6814  // where it is known that there is no sNaN.
6815 if (IsIEEEMode)
6816 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6817
6818 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6819 VT == MVT::v16bf16)
6820 return splitBinaryVectorOp(Op, DAG);
6821 return Op;
6822}
6823
6824SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6825 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6826 EVT VT = Op.getValueType();
6827 assert(VT == MVT::f16);
6828
6829 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6830 EVT ExpVT = Exp.getValueType();
6831 if (ExpVT == MVT::i16)
6832 return Op;
6833
6834 SDLoc DL(Op);
6835
6836 // Correct the exponent type for f16 to i16.
6837 // Clamp the range of the exponent to the instruction's range.
6838
6839  // TODO: This should be a generic narrowing legalization, and can easily be
6840  // done for GlobalISel.
6841
6842 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6843 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6844
6845 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6846 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6847
6848 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6849
6850 if (IsStrict) {
6851 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6852 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6853 }
6854
6855 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6856}
6857
6859 switch (Op->getOpcode()) {
6860 case ISD::SRA:
6861 case ISD::SMIN:
6862 case ISD::SMAX:
6863 return ISD::SIGN_EXTEND;
6864 case ISD::SRL:
6865 case ISD::UMIN:
6866 case ISD::UMAX:
6867 return ISD::ZERO_EXTEND;
6868 case ISD::ADD:
6869 case ISD::SUB:
6870 case ISD::AND:
6871 case ISD::OR:
6872 case ISD::XOR:
6873 case ISD::SHL:
6874 case ISD::SELECT:
6875 case ISD::MUL:
6876 // operation result won't be influenced by garbage high bits.
6877 // TODO: are all of those cases correct, and are there more?
6878 return ISD::ANY_EXTEND;
6879 case ISD::SETCC: {
6880 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6882 }
6883 default:
6884 llvm_unreachable("unexpected opcode!");
6885 }
6886}
6887
6888SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6889 DAGCombinerInfo &DCI) const {
6890 const unsigned Opc = Op.getOpcode();
6891 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6892 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6893 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6894 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6895 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6896
6897 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6898 : Op->getOperand(0).getValueType();
6899 auto ExtTy = OpTy.changeElementType(MVT::i32);
6900
6901 if (DCI.isBeforeLegalizeOps() ||
6902 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6903 return SDValue();
6904
6905 auto &DAG = DCI.DAG;
6906
6907 SDLoc DL(Op);
6908 SDValue LHS;
6909 SDValue RHS;
6910 if (Opc == ISD::SELECT) {
6911 LHS = Op->getOperand(1);
6912 RHS = Op->getOperand(2);
6913 } else {
6914 LHS = Op->getOperand(0);
6915 RHS = Op->getOperand(1);
6916 }
6917
6918 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6919 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6920
6921 // Special case: for shifts, the RHS always needs a zext.
6922 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6923 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6924 else
6925 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6926
6927  // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
6928 if (Opc == ISD::SETCC) {
6929 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6930 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6931 }
6932
6933 // For other ops, we extend the operation's return type as well so we need to
6934 // truncate back to the original type.
6935 SDValue NewVal;
6936 if (Opc == ISD::SELECT)
6937 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6938 else
6939 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6940
6941 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6942}
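
// Example of the promotion above (illustrative): a uniform (i16 srl %a, %b)
// becomes trunc (i32 srl (zext %a), (zext %b)). For srl/umin/umax the operands
// are zero-extended, for sra/smin/smax sign-extended, and for add/and/or/etc.
// any-extended, since garbage high bits cannot affect the low 16 bits there.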
6943
6944// Custom lowering for vector multiplications and s_mul_u64.
6945SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6946 EVT VT = Op.getValueType();
6947
6948 // Split vector operands.
6949 if (VT.isVector())
6950 return splitBinaryVectorOp(Op, DAG);
6951
6952 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6953
6954 // There are four ways to lower s_mul_u64:
6955 //
6956 // 1. If all the operands are uniform, then we lower it as it is.
6957 //
6958 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6959 // multiplications because there is not a vector equivalent of s_mul_u64.
6960 //
6961 // 3. If the cost model decides that it is more efficient to use vector
6962 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6963 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6964 //
6965 // 4. If the cost model decides to use vector registers and both of the
6966 // operands are zero-extended/sign-extended from 32-bits, then we split the
6967 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6968 // possible to check if the operands are zero-extended or sign-extended in
6969 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6970 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6971 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6972 // If the cost model decides that we have to use vector registers, then
6973 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6974 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6975 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6976 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6977 // SIInstrInfo.cpp .
6978
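  // For reference (informal): with a = aHi:aLo and b = bHi:bLo,
  //   a * b = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32)   (mod 2^64)
  // When both operands are known zero-extended from 32 bits the high halves are
  // zero and a single unsigned 32x32->64 multiply suffices; when both are
  // sign-extended a single signed 32x32->64 multiply gives the same 64-bit
  // result. That is what the S_MUL_U64_U32_PSEUDO / S_MUL_I64_I32_PSEUDO
  // selections below express.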
6979 if (Op->isDivergent())
6980 return SDValue();
6981
6982 SDValue Op0 = Op.getOperand(0);
6983 SDValue Op1 = Op.getOperand(1);
6984  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6985 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6986 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6987 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6988 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6989 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6990 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6991 SDLoc SL(Op);
6992 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6993 return SDValue(
6994 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6995 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6996 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6997 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6998 return SDValue(
6999 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7000 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7001 return Op;
7002}
7003
7004SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7005 EVT VT = Op.getValueType();
7006 SDLoc SL(Op);
7007 SDValue LHS = Op.getOperand(0);
7008 SDValue RHS = Op.getOperand(1);
7009 bool isSigned = Op.getOpcode() == ISD::SMULO;
7010
7011 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7012 const APInt &C = RHSC->getAPIntValue();
7013 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7014 if (C.isPowerOf2()) {
7015 // smulo(x, signed_min) is the same as umulo(x, signed_min).
7016 bool UseArithShift = isSigned && !C.isMinSignedValue();
7017 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7018 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7019 SDValue Overflow =
7020 DAG.getSetCC(SL, MVT::i1,
7021 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7022 Result, ShiftAmt),
7023 LHS, ISD::SETNE);
7024 return DAG.getMergeValues({Result, Overflow}, SL);
7025 }
7026 }
7027
7028 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7029 SDValue Top =
7030 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7031
7032 SDValue Sign = isSigned
7033 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7034 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7035 SL, MVT::i32))
7036 : DAG.getConstant(0, SL, VT);
7037 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7038
7039 return DAG.getMergeValues({Result, Overflow}, SL);
7040}
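
// A minimal scalar sketch of the power-of-two case above (illustrative helper,
// not part of this file): for umulo(x, 1 << s) the low result is x << s, and
// overflow is detected by shifting back and comparing with the original x.
static inline uint64_t umuloPow2Sketch(uint64_t X, unsigned S, bool &Overflow) {
  uint64_t Result = X << S;       // low 64 bits of the product
  Overflow = (Result >> S) != X;  // high bits were lost iff shift-back differs
  return Result;
}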
7041
7042SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7043 if (Op->isDivergent()) {
7044 // Select to V_MAD_[IU]64_[IU]32.
7045 return Op;
7046 }
7047 if (Subtarget->hasSMulHi()) {
7048 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7049 return SDValue();
7050 }
7051 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7052 // calculate the high part, so we might as well do the whole thing with
7053 // V_MAD_[IU]64_[IU]32.
7054 return Op;
7055}
7056
7057SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7058 if (!Subtarget->isTrapHandlerEnabled() ||
7060 return lowerTrapEndpgm(Op, DAG);
7061
7062 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7063 : lowerTrapHsaQueuePtr(Op, DAG);
7064}
7065
7066SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7067 SDLoc SL(Op);
7068 SDValue Chain = Op.getOperand(0);
7069 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7070}
7071
7072SDValue
7073SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7074 const SDLoc &DL, Align Alignment,
7075 ImplicitParameter Param) const {
7078 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7080 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7083}
7084
7085SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7086 SelectionDAG &DAG) const {
7087 SDLoc SL(Op);
7088 SDValue Chain = Op.getOperand(0);
7089
7090 SDValue QueuePtr;
7091 // For code object version 5, QueuePtr is passed through implicit kernarg.
7092 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7094 QueuePtr =
7095 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7096 } else {
7099 Register UserSGPR = Info->getQueuePtrUserSGPR();
7100
7101 if (UserSGPR == AMDGPU::NoRegister) {
7102 // We probably are in a function incorrectly marked with
7103 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7104 // trap, so just use a null pointer.
7105 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7106 } else {
7107 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7108 MVT::i64);
7109 }
7110 }
7111
7112 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7113 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7114
7116 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7117 ToReg.getValue(1)};
7118 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7119}
7120
7121SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7122 SDLoc SL(Op);
7123 SDValue Chain = Op.getOperand(0);
7124
7125 // We need to simulate the 's_trap 2' instruction on targets that run in
7126 // PRIV=1 (where it is treated as a nop).
7127 if (Subtarget->hasPrivEnabledTrap2NopBug())
7128 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7129
7131 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7132 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7133}
7134
7135SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7136 SDLoc SL(Op);
7137 SDValue Chain = Op.getOperand(0);
7139
7140 if (!Subtarget->isTrapHandlerEnabled() ||
7143 "debugtrap handler not supported",
7144 Op.getDebugLoc(), DS_Warning);
7145 LLVMContext &Ctx = MF.getFunction().getContext();
7146 Ctx.diagnose(NoTrap);
7147 return Chain;
7148 }
7149
7150 uint64_t TrapID =
7152 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7153 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7154}
7155
7156SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7157 SelectionDAG &DAG) const {
7158 if (Subtarget->hasApertureRegs()) {
7159 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7160 ? AMDGPU::SRC_SHARED_BASE
7161 : AMDGPU::SRC_PRIVATE_BASE;
7162 // Note: this feature (register) is broken. When used as a 32-bit operand,
7163 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7164 // bits.
7165 //
7166 // To work around the issue, directly emit a 64-bit mov from this register,
7167 // then extract the high bits. Note that this shouldn't even result in a
7168 // shift being emitted; it should simply become a pair of registers (e.g.):
7169 // s_mov_b64 s[6:7], src_shared_base
7170 // v_mov_b32_e32 v1, s7
7171 //
7172 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7173 // coalescing would kick in and it would think it's okay to use the "HI"
7174 // subregister directly (instead of extracting the HI 32 bits) which is an
7175 // artificial (unusable) register.
7176 // Register TableGen definitions would need an overhaul to get rid of the
7177 // artificial "HI" aperture registers and prevent this kind of issue from
7178 // happening.
7179 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7180 DAG.getRegister(ApertureRegNo, MVT::i64));
7181 return DAG.getNode(
7182 ISD::TRUNCATE, DL, MVT::i32,
7183 DAG.getNode(ISD::SRL, DL, MVT::i64,
7184 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7185 }
7186
7187 // For code object version 5, private_base and shared_base are passed through
7188 // implicit kernargs.
7189 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7193 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7194 }
7195
7198 Register UserSGPR = Info->getQueuePtrUserSGPR();
7199 if (UserSGPR == AMDGPU::NoRegister) {
7200 // We probably are in a function incorrectly marked with
7201 // amdgpu-no-queue-ptr. This is undefined.
7202 return DAG.getUNDEF(MVT::i32);
7203 }
7204
7205 SDValue QueuePtr =
7206 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7207
7208 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7209 // private_segment_aperture_base_hi.
7210 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7211
7212 SDValue Ptr =
7213 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7214
7215 // TODO: Use custom target PseudoSourceValue.
7216 // TODO: We should use the value from the IR intrinsic call, but it might not
7217 // be available and how do we get it?
7219 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7220 commonAlignment(Align(64), StructOffset),
7223}
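
// A minimal scalar sketch of the workaround above (illustrative helper, not
// part of this file): the usable aperture base lives in the high 32 bits of
// the 64-bit src_shared_base / src_private_base value, so it is recovered by
// a 32-bit right shift followed by truncation.
static inline uint32_t apertureFromSrcBaseSketch(uint64_t SrcBase64) {
  return uint32_t(SrcBase64 >> 32);
}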
7224
7225/// Return true if the value is a known valid address, such that a null check is
7226/// not necessary.
7228 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7229 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7230 isa<BasicBlockSDNode>(Val))
7231 return true;
7232
7233 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7234 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7235
7236 // TODO: Search through arithmetic, handle arguments and loads
7237 // marked nonnull.
7238 return false;
7239}
7240
7241SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7242 SelectionDAG &DAG) const {
7243 SDLoc SL(Op);
7244
7245 const AMDGPUTargetMachine &TM =
7246 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7247
7248 unsigned DestAS, SrcAS;
7249 SDValue Src;
7250 bool IsNonNull = false;
7251 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7252 SrcAS = ASC->getSrcAddressSpace();
7253 Src = ASC->getOperand(0);
7254 DestAS = ASC->getDestAddressSpace();
7255 } else {
7256 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7257 Op.getConstantOperandVal(0) ==
7258 Intrinsic::amdgcn_addrspacecast_nonnull);
7259 Src = Op->getOperand(1);
7260 SrcAS = Op->getConstantOperandVal(2);
7261 DestAS = Op->getConstantOperandVal(3);
7262 IsNonNull = true;
7263 }
7264
7265 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7266
7267 // flat -> local/private
7268 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7269 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7270 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7271 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7272
7273 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7274 return Ptr;
7275
7276 unsigned NullVal = TM.getNullPointerValue(DestAS);
7277 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7278 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7279
7280 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7281 SegmentNullPtr);
7282 }
7283 }
7284
7285 // local/private -> flat
7286 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7287 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7288 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7289
7290 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7291 SDValue CvtPtr =
7292 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7293 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7294
7295 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7296 return CvtPtr;
7297
7298 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7299 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7300
7301 SDValue NonNull =
7302 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7303
7304 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7305 FlatNullPtr);
7306 }
7307 }
7308
7309 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7310 Op.getValueType() == MVT::i64) {
7313 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7314 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7315 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7316 }
7317
7318 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7319 Src.getValueType() == MVT::i64)
7320 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7321
7322 // global <-> flat are no-ops and never emitted.
7323
7324 const MachineFunction &MF = DAG.getMachineFunction();
7325 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7326 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7327 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7328
7329 return DAG.getUNDEF(Op->getValueType(0));
7330}
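
// A minimal scalar sketch of the segment -> flat conversion above (helper and
// parameter names are illustrative, not part of this file): the aperture
// supplies the high 32 bits of the flat address, and the segment null value
// must map to the flat null value rather than to a valid flat address.
static inline uint64_t segmentToFlatSketch(uint32_t SegPtr, uint32_t ApertureHi,
                                           uint32_t SegmentNull,
                                           uint64_t FlatNull) {
  if (SegPtr == SegmentNull)
    return FlatNull;
  return (uint64_t(ApertureHi) << 32) | SegPtr;
}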
7331
7332// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7333// the small vector and inserting them into the big vector. That is better than
7334// the default expansion of doing it via a stack slot. Even though the use of
7335// the stack slot would be optimized away afterwards, the stack slot itself
7336// remains.
7337SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7338 SelectionDAG &DAG) const {
7339 SDValue Vec = Op.getOperand(0);
7340 SDValue Ins = Op.getOperand(1);
7341 SDValue Idx = Op.getOperand(2);
7342 EVT VecVT = Vec.getValueType();
7343 EVT InsVT = Ins.getValueType();
7344 EVT EltVT = VecVT.getVectorElementType();
7345 unsigned InsNumElts = InsVT.getVectorNumElements();
7346 unsigned IdxVal = Idx->getAsZExtVal();
7347 SDLoc SL(Op);
7348
7349 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7350 // Insert 32-bit registers at a time.
7351 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7352
7353 unsigned VecNumElts = VecVT.getVectorNumElements();
7354 EVT NewVecVT =
7355 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7356 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7358 MVT::i32, InsNumElts / 2);
7359
7360 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7361 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7362
7363 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7364 SDValue Elt;
7365 if (InsNumElts == 2) {
7366 Elt = Ins;
7367 } else {
7368 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7369 DAG.getConstant(I, SL, MVT::i32));
7370 }
7371 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7372 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7373 }
7374
7375 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7376 }
7377
7378 for (unsigned I = 0; I != InsNumElts; ++I) {
7379 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7380 DAG.getConstant(I, SL, MVT::i32));
7381 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7382 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7383 }
7384 return Vec;
7385}
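
// A minimal scalar sketch of the even-index 16-bit path above (illustrative
// helper, not part of this file): two adjacent 16-bit elements are written as
// a single 32-bit lane of the bitcast vector, avoiding per-element inserts.
static inline void insertPair16Sketch(uint32_t *VecDwords, unsigned EvenIdx,
                                      uint16_t Lo, uint16_t Hi) {
  VecDwords[EvenIdx / 2] = uint32_t(Lo) | (uint32_t(Hi) << 16);
}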
7386
7387SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7388 SelectionDAG &DAG) const {
7389 SDValue Vec = Op.getOperand(0);
7390 SDValue InsVal = Op.getOperand(1);
7391 SDValue Idx = Op.getOperand(2);
7392 EVT VecVT = Vec.getValueType();
7393 EVT EltVT = VecVT.getVectorElementType();
7394 unsigned VecSize = VecVT.getSizeInBits();
7395 unsigned EltSize = EltVT.getSizeInBits();
7396 SDLoc SL(Op);
7397
7398 // Specially handle the case of v4i16 with static indexing.
7399 unsigned NumElts = VecVT.getVectorNumElements();
7400 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7401 if (NumElts == 4 && EltSize == 16 && KIdx) {
7402 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7403
7404 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7405 DAG.getConstant(0, SL, MVT::i32));
7406 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7407 DAG.getConstant(1, SL, MVT::i32));
7408
7409 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7410 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7411
7412 unsigned Idx = KIdx->getZExtValue();
7413 bool InsertLo = Idx < 2;
7414 SDValue InsHalf = DAG.getNode(
7415 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7416 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7417 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7418
7419 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7420
7421 SDValue Concat =
7422 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7423 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7424
7425 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7426 }
7427
7428 // Static indexing does not lower to stack access, and hence there is no need
7429 // for special custom lowering to avoid stack access.
7430 if (isa<ConstantSDNode>(Idx))
7431 return SDValue();
7432
7433 // Avoid stack access for dynamic indexing by custom lowering to
7434 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7435
7436 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7437
7438 MVT IntVT = MVT::getIntegerVT(VecSize);
7439
7440 // Convert vector index to bit-index and get the required bit mask.
7441 assert(isPowerOf2_32(EltSize));
7442 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7443 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7444 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7445 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7446 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7447
7448 // 1. Create a congruent vector with the target value in each element.
7449 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7450 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7451
7452 // 2. Mask off all other indices except the required index within (1).
7453 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7454
7455 // 3. Mask off the required index within the target vector.
7456 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7457 SDValue RHS =
7458 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7459
7460 // 4. Get (2) and (3) ORed into the target vector.
7461 SDValue BFI =
7462 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7463
7464 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7465}
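
// A minimal scalar sketch of the v_bfm/v_bfi lowering above, for a 64-bit
// vector of 16-bit elements (helper name is illustrative, not part of this
// file): build a lane mask from the dynamic index, splat the value into every
// lane, and merge it into the old vector.
static inline uint64_t dynamicInsert16Sketch(uint64_t Vec, uint16_t Val,
                                             unsigned Idx) {
  uint64_t BFM = 0xffffull << (Idx * 16);        // mask of the target lane
  uint64_t Splat = 0x0001000100010001ull * Val;  // Val in every 16-bit lane
  return (BFM & Splat) | (~BFM & Vec);           // BFI-style merge
}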
7466
7467SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7468 SelectionDAG &DAG) const {
7469 SDLoc SL(Op);
7470
7471 EVT ResultVT = Op.getValueType();
7472 SDValue Vec = Op.getOperand(0);
7473 SDValue Idx = Op.getOperand(1);
7474 EVT VecVT = Vec.getValueType();
7475 unsigned VecSize = VecVT.getSizeInBits();
7476 EVT EltVT = VecVT.getVectorElementType();
7477
7478 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7479
7480 // Make sure we do any optimizations that will make it easier to fold
7481 // source modifiers before obscuring it with bit operations.
7482
7483 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7484 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7485 return Combined;
7486
7487 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7488 SDValue Lo, Hi;
7489 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7490
7491 if (VecSize == 128) {
7492 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7493 Lo = DAG.getBitcast(LoVT,
7494 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7495 DAG.getConstant(0, SL, MVT::i32)));
7496 Hi = DAG.getBitcast(HiVT,
7497 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7498 DAG.getConstant(1, SL, MVT::i32)));
7499 } else if (VecSize == 256) {
7500 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7501 SDValue Parts[4];
7502 for (unsigned P = 0; P < 4; ++P) {
7503 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7504 DAG.getConstant(P, SL, MVT::i32));
7505 }
7506
7507 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7508 Parts[0], Parts[1]));
7509 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7510 Parts[2], Parts[3]));
7511 } else {
7512 assert(VecSize == 512);
7513
7514 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7515 SDValue Parts[8];
7516 for (unsigned P = 0; P < 8; ++P) {
7517 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7518 DAG.getConstant(P, SL, MVT::i32));
7519 }
7520
7521 Lo = DAG.getBitcast(LoVT,
7522 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7523 Parts[0], Parts[1], Parts[2], Parts[3]));
7524 Hi = DAG.getBitcast(HiVT,
7525 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7526 Parts[4], Parts[5], Parts[6], Parts[7]));
7527 }
7528
7529 EVT IdxVT = Idx.getValueType();
7530 unsigned NElem = VecVT.getVectorNumElements();
7531 assert(isPowerOf2_32(NElem));
7532 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7533 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7534 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7535 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7536 }
7537
7538 assert(VecSize <= 64);
7539
7540 MVT IntVT = MVT::getIntegerVT(VecSize);
7541
7542 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7543 SDValue VecBC = peekThroughBitcasts(Vec);
7544 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7545 SDValue Src = VecBC.getOperand(0);
7546 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7547 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7548 }
7549
7550 unsigned EltSize = EltVT.getSizeInBits();
7551 assert(isPowerOf2_32(EltSize));
7552
7553 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7554
7555 // Convert vector index to bit-index (* EltSize)
7556 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7557
7558 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7559 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7560
7561 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7562 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7563 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7564 }
7565
7566 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7567}
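
// A minimal scalar sketch of the small-vector path above (illustrative helper,
// not part of this file): a dynamic extract from a <= 64-bit vector is just a
// variable right shift of the bitcast integer followed by truncation.
static inline uint16_t dynamicExtract16Sketch(uint64_t Vec, unsigned Idx) {
  return uint16_t(Vec >> (Idx * 16));
}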
7568
7569static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7570 assert(Elt % 2 == 0);
7571 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7572}
7573
7574SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7575 SelectionDAG &DAG) const {
7576 SDLoc SL(Op);
7577 EVT ResultVT = Op.getValueType();
7578 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7579 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7580 MVT PackVT = MVT::getVectorVT(EltVT, 2);
7581 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7582
7583 // vector_shuffle <0,1,6,7> lhs, rhs
7584 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7585 //
7586 // vector_shuffle <6,7,2,3> lhs, rhs
7587 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7588 //
7589 // vector_shuffle <6,7,0,1> lhs, rhs
7590 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7591
7592 // Avoid scalarizing when both halves are reading from consecutive elements.
7594 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7595 if (elementPairIsContiguous(SVN->getMask(), I)) {
7596 const int Idx = SVN->getMaskElt(I);
7597 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7598 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7599 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7600 SVN->getOperand(VecIdx),
7601 DAG.getConstant(EltIdx, SL, MVT::i32));
7602 Pieces.push_back(SubVec);
7603 } else {
7604 const int Idx0 = SVN->getMaskElt(I);
7605 const int Idx1 = SVN->getMaskElt(I + 1);
7606 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7607 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7608 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7609 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7610
7611 SDValue Vec0 = SVN->getOperand(VecIdx0);
7612 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7613 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7614
7615 SDValue Vec1 = SVN->getOperand(VecIdx1);
7616 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7617 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7618 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7619 }
7620 }
7621
7622 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7623}
7624
7625SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7626 SelectionDAG &DAG) const {
7627 SDValue SVal = Op.getOperand(0);
7628 EVT ResultVT = Op.getValueType();
7629 EVT SValVT = SVal.getValueType();
7630 SDValue UndefVal = DAG.getUNDEF(SValVT);
7631 SDLoc SL(Op);
7632
7634 VElts.push_back(SVal);
7635 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7636 VElts.push_back(UndefVal);
7637
7638 return DAG.getBuildVector(ResultVT, SL, VElts);
7639}
7640
7641SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7642 SelectionDAG &DAG) const {
7643 SDLoc SL(Op);
7644 EVT VT = Op.getValueType();
7645
7646 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7647 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7648
7649 SDValue Lo = Op.getOperand(0);
7650 SDValue Hi = Op.getOperand(1);
7651
7652 // Avoid adding defined bits with the zero_extend.
7653 if (Hi.isUndef()) {
7654 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7655 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7656 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7657 }
7658
7659 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7660 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7661
7662 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7663 DAG.getConstant(16, SL, MVT::i32));
7664 if (Lo.isUndef())
7665 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7666
7667 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7668 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7669
7670 SDValue Or =
7671 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7672 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7673 }
7674
7675 // Split into 2-element chunks.
7676 const unsigned NumParts = VT.getVectorNumElements() / 2;
7678 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7679
7681 for (unsigned P = 0; P < NumParts; ++P) {
7682 SDValue Vec = DAG.getBuildVector(
7683 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7684 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7685 }
7686
7687 SDValue Blend =
7688 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7689 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7690}
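
// A minimal scalar sketch of the two-element packing above (illustrative
// helper, not part of this file): a v2i16/v2f16 build_vector becomes a
// zext/shl/or that places the first element in the low half and the second in
// the high half of an i32.
static inline uint32_t packHalvesSketch(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}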
7691
7693 const GlobalAddressSDNode *GA) const {
7694 // OSes that use ELF REL relocations (instead of RELA) can only store a
7695 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7696 // which can create arbitrary 64-bit addends. (This is only a problem for
7697 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7698 // the high 32 bits of the addend.)
7699 //
7700 // This should be kept in sync with how HasRelocationAddend is initialized in
7701 // the constructor of ELFAMDGPUAsmBackend.
7702 if (!Subtarget->isAmdHsaOS())
7703 return false;
7704
7705 // We can fold offsets for anything that doesn't require a GOT relocation.
7706 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7710}
7711
7712static SDValue
7714 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7715 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7716 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7717 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7718 // lowered to the following code sequence:
7719 //
7720 // For constant address space:
7721 // s_getpc_b64 s[0:1]
7722 // s_add_u32 s0, s0, $symbol
7723 // s_addc_u32 s1, s1, 0
7724 //
7725 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7726 // a fixup or relocation is emitted to replace $symbol with a literal
7727 // constant, which is a pc-relative offset from the encoding of the $symbol
7728 // operand to the global variable.
7729 //
7730 // For global address space:
7731 // s_getpc_b64 s[0:1]
7732 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7733 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7734 //
7735 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7736 // fixups or relocations are emitted to replace $symbol@*@lo and
7737 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7738 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7739 // operand to the global variable.
7740 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7741 SDValue PtrHi;
7742 if (GAFlags == SIInstrInfo::MO_NONE)
7743 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7744 else
7745 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7746 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7747}
7748
7749SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7750 SDValue Op,
7751 SelectionDAG &DAG) const {
7752 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7753 SDLoc DL(GSD);
7754 EVT PtrVT = Op.getValueType();
7755
7756 const GlobalValue *GV = GSD->getGlobal();
7762 GV->hasExternalLinkage()) {
7763 Type *Ty = GV->getValueType();
7764 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7765 // zero-sized type in other languages to declare dynamic shared
7766 // memory whose size is not known at compile time. Such arrays are
7767 // allocated by the runtime and placed directly after the statically
7768 // allocated ones. They all share the same offset.
7769 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7770 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7771 // Adjust alignment for that dynamic shared memory array.
7773 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7774 MFI->setUsesDynamicLDS(true);
7775 return SDValue(
7776 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7777 }
7778 }
7780 }
7781
7783 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7785 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7786 }
7787
7788 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7789 SDValue AddrLo = DAG.getTargetGlobalAddress(
7790 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7791 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7792
7793 SDValue AddrHi = DAG.getTargetGlobalAddress(
7794 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7795 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7796
7797 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7798 }
7799
7800 if (shouldEmitFixup(GV))
7801 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7802
7803 if (shouldEmitPCReloc(GV))
7804 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7806
7807 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7809
7810 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7812 const DataLayout &DataLayout = DAG.getDataLayout();
7813 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7814 MachinePointerInfo PtrInfo =
7816
7817 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7820}
7821
7823 const SDLoc &DL, SDValue V) const {
7824 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7825 // the destination register.
7826 //
7827 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7828 // so we will end up with redundant moves to m0.
7829 //
7830 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7831
7832 // A Null SDValue creates a glue result.
7833 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7834 V, Chain);
7835 return SDValue(M0, 0);
7836}
7837
7838SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7839 MVT VT,
7840 unsigned Offset) const {
7841 SDLoc SL(Op);
7842 SDValue Param = lowerKernargMemParameter(
7843 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7844 // The local size values will have the hi 16-bits as zero.
7845 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7846 DAG.getValueType(VT));
7847}
7848
7850 EVT VT) {
7852 "non-hsa intrinsic with hsa target",
7853 DL.getDebugLoc());
7854 DAG.getContext()->diagnose(BadIntrin);
7855 return DAG.getUNDEF(VT);
7856}
7857
7859 EVT VT) {
7861 "intrinsic not supported on subtarget",
7862 DL.getDebugLoc());
7863 DAG.getContext()->diagnose(BadIntrin);
7864 return DAG.getUNDEF(VT);
7865}
7866
7868 ArrayRef<SDValue> Elts) {
7869 assert(!Elts.empty());
7870 MVT Type;
7871 unsigned NumElts = Elts.size();
7872
7873 if (NumElts <= 12) {
7874 Type = MVT::getVectorVT(MVT::f32, NumElts);
7875 } else {
7876 assert(Elts.size() <= 16);
7877 Type = MVT::v16f32;
7878 NumElts = 16;
7879 }
7880
7881 SmallVector<SDValue, 16> VecElts(NumElts);
7882 for (unsigned i = 0; i < Elts.size(); ++i) {
7883 SDValue Elt = Elts[i];
7884 if (Elt.getValueType() != MVT::f32)
7885 Elt = DAG.getBitcast(MVT::f32, Elt);
7886 VecElts[i] = Elt;
7887 }
7888 for (unsigned i = Elts.size(); i < NumElts; ++i)
7889 VecElts[i] = DAG.getUNDEF(MVT::f32);
7890
7891 if (NumElts == 1)
7892 return VecElts[0];
7893 return DAG.getBuildVector(Type, DL, VecElts);
7894}
7895
7896static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7897 SDValue Src, int ExtraElts) {
7898 EVT SrcVT = Src.getValueType();
7899
7901
7902 if (SrcVT.isVector())
7903 DAG.ExtractVectorElements(Src, Elts);
7904 else
7905 Elts.push_back(Src);
7906
7907 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7908 while (ExtraElts--)
7909 Elts.push_back(Undef);
7910
7911 return DAG.getBuildVector(CastVT, DL, Elts);
7912}
7913
7914 // Re-construct the required return value for an image load intrinsic.
7915 // This is more complicated due to the optional use of TexFailCtrl, which means
7916 // the required return type is an aggregate.
7918 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7919 bool Unpacked, bool IsD16, int DMaskPop,
7920 int NumVDataDwords, bool IsAtomicPacked16Bit,
7921 const SDLoc &DL) {
7922 // Determine the required return type. This is the same regardless of
7923 // IsTexFail flag
7924 EVT ReqRetVT = ResultTypes[0];
7925 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7926 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7927 ? (ReqRetNumElts + 1) / 2
7928 : ReqRetNumElts;
7929
7930 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7931
7932 MVT DataDwordVT =
7933 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7934
7935 MVT MaskPopVT =
7936 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7937
7938 SDValue Data(Result, 0);
7939 SDValue TexFail;
7940
7941 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7942 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7943 if (MaskPopVT.isVector()) {
7944 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7945 SDValue(Result, 0), ZeroIdx);
7946 } else {
7947 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7948 SDValue(Result, 0), ZeroIdx);
7949 }
7950 }
7951
7952 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7953 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7954 NumDataDwords - MaskPopDwords);
7955
7956 if (IsD16)
7957 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7958
7959 EVT LegalReqRetVT = ReqRetVT;
7960 if (!ReqRetVT.isVector()) {
7961 if (!Data.getValueType().isInteger())
7962 Data = DAG.getNode(ISD::BITCAST, DL,
7963 Data.getValueType().changeTypeToInteger(), Data);
7964 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7965 } else {
7966 // We need to widen the return vector to a legal type
7967 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7968 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7969 LegalReqRetVT =
7971 ReqRetVT.getVectorNumElements() + 1);
7972 }
7973 }
7974 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7975
7976 if (IsTexFail) {
7977 TexFail =
7978 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7979 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7980
7981 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7982 }
7983
7984 if (Result->getNumValues() == 1)
7985 return Data;
7986
7987 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7988}
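
// A minimal sketch of the dword accounting above (illustrative helper, not
// part of this file): packed D16 results carry two 16-bit elements per dword,
// so the dword count is the rounded-up half of the element count.
static inline int numDataDwordsSketch(int ReqRetNumElts, bool IsD16,
                                      bool Unpacked) {
  return (IsD16 && !Unpacked) ? (ReqRetNumElts + 1) / 2 : ReqRetNumElts;
}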
7989
7990static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7991 SDValue *LWE, bool &IsTexFail) {
7992 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7993
7994 uint64_t Value = TexFailCtrlConst->getZExtValue();
7995 if (Value) {
7996 IsTexFail = true;
7997 }
7998
7999 SDLoc DL(TexFailCtrlConst);
8000 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8001 Value &= ~(uint64_t)0x1;
8002 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8003 Value &= ~(uint64_t)0x2;
8004
8005 return Value == 0;
8006}
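
// A minimal sketch of the texfailctrl decoding above (illustrative helper, not
// part of this file): bit 0 of the immediate selects TFE and bit 1 selects
// LWE; the control is only valid if no other bits are set.
static inline bool decodeTexFailCtrlSketch(uint64_t Value, bool &TFE,
                                           bool &LWE) {
  TFE = (Value & 0x1) != 0;
  LWE = (Value & 0x2) != 0;
  return (Value & ~uint64_t(0x3)) == 0; // valid iff no unknown bits remain
}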
8007
8009 MVT PackVectorVT,
8010 SmallVectorImpl<SDValue> &PackedAddrs,
8011 unsigned DimIdx, unsigned EndIdx,
8012 unsigned NumGradients) {
8013 SDLoc DL(Op);
8014 for (unsigned I = DimIdx; I < EndIdx; I++) {
8015 SDValue Addr = Op.getOperand(I);
8016
8017 // Gradients are packed with undef for each coordinate.
8018 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8019 // 1D: undef,dx/dh; undef,dx/dv
8020 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8021 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8022 if (((I + 1) >= EndIdx) ||
8023 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8024 I == DimIdx + NumGradients - 1))) {
8025 if (Addr.getValueType() != MVT::i16)
8026 Addr = DAG.getBitcast(MVT::i16, Addr);
8027 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8028 } else {
8029 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8030 I++;
8031 }
8032 Addr = DAG.getBitcast(MVT::f32, Addr);
8033 PackedAddrs.push_back(Addr);
8034 }
8035}
8036
8037SDValue SITargetLowering::lowerImage(SDValue Op,
8039 SelectionDAG &DAG, bool WithChain) const {
8040 SDLoc DL(Op);
8042 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8043 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8045 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8046 unsigned IntrOpcode = Intr->BaseOpcode;
8047 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8048 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8049 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8050
8051 SmallVector<EVT, 3> ResultTypes(Op->values());
8052 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8053 bool IsD16 = false;
8054 bool IsG16 = false;
8055 bool IsA16 = false;
8056 SDValue VData;
8057 int NumVDataDwords = 0;
8058 bool AdjustRetType = false;
8059 bool IsAtomicPacked16Bit = false;
8060
8061 // Offset of intrinsic arguments
8062 const unsigned ArgOffset = WithChain ? 2 : 1;
8063
8064 unsigned DMask;
8065 unsigned DMaskLanes = 0;
8066
8067 if (BaseOpcode->Atomic) {
8068 VData = Op.getOperand(2);
8069
8070 IsAtomicPacked16Bit =
8071 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8072 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8073
8074 bool Is64Bit = VData.getValueSizeInBits() == 64;
8075 if (BaseOpcode->AtomicX2) {
8076 SDValue VData2 = Op.getOperand(3);
8077 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8078 {VData, VData2});
8079 if (Is64Bit)
8080 VData = DAG.getBitcast(MVT::v4i32, VData);
8081
8082 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8083 DMask = Is64Bit ? 0xf : 0x3;
8084 NumVDataDwords = Is64Bit ? 4 : 2;
8085 } else {
8086 DMask = Is64Bit ? 0x3 : 0x1;
8087 NumVDataDwords = Is64Bit ? 2 : 1;
8088 }
8089 } else {
8090 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8091 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8092
8093 if (BaseOpcode->Store) {
8094 VData = Op.getOperand(2);
8095
8096 MVT StoreVT = VData.getSimpleValueType();
8097 if (StoreVT.getScalarType() == MVT::f16) {
8098 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8099 return Op; // D16 is unsupported for this instruction
8100
8101 IsD16 = true;
8102 VData = handleD16VData(VData, DAG, true);
8103 }
8104
8105 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8106 } else if (!BaseOpcode->NoReturn) {
8107 // Work out the number of dwords based on the dmask popcount and underlying
8108 // type and whether packing is supported.
8109 MVT LoadVT = ResultTypes[0].getSimpleVT();
8110 if (LoadVT.getScalarType() == MVT::f16) {
8111 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8112 return Op; // D16 is unsupported for this instruction
8113
8114 IsD16 = true;
8115 }
8116
8117 // Confirm that the return type is large enough for the dmask specified
8118 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8119 (!LoadVT.isVector() && DMaskLanes > 1))
8120 return Op;
8121
8122 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8123 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8124 // instructions.
8125 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8126 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8127 NumVDataDwords = (DMaskLanes + 1) / 2;
8128 else
8129 NumVDataDwords = DMaskLanes;
8130
8131 AdjustRetType = true;
8132 }
8133 }
8134
8135 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8137
8138 // Check for 16 bit addresses or derivatives and pack if true.
8139 MVT VAddrVT =
8140 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8141 MVT VAddrScalarVT = VAddrVT.getScalarType();
8142 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8143 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8144
8145 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8146 VAddrScalarVT = VAddrVT.getScalarType();
8147 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8148 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8149
8150 // Push back extra arguments.
8151 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8152 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8153 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8154 // Special handling of bias when A16 is on. Bias is of type half but
8155 // occupies a full 32 bits.
8156 SDValue Bias = DAG.getBuildVector(
8157 MVT::v2f16, DL,
8158 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8159 VAddrs.push_back(Bias);
8160 } else {
8161 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8162 "Bias needs to be converted to 16 bit in A16 mode");
8163 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8164 }
8165 }
8166
8167 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8168 // 16-bit gradients are supported, but are tied to the A16 control,
8169 // so both gradients and addresses must be 16-bit.
8170 LLVM_DEBUG(
8171 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8172 "require 16 bit args for both gradients and addresses");
8173 return Op;
8174 }
8175
8176 if (IsA16) {
8177 if (!ST->hasA16()) {
8178 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8179 "support 16 bit addresses\n");
8180 return Op;
8181 }
8182 }
8183
8184 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8185 // is set then we have to compress/pack operands (either addresses,
8186 // gradients, or both).
8187 // In the case where A16 and gradients are tied (no G16 support) we have
8188 // already verified that both IsA16 and IsG16 are true.
8189 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8190 // Activate g16
8191 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8193 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8194 }
8195
8196 // Add gradients (packed or unpacked)
8197 if (IsG16) {
8198 // Pack the gradients
8199 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8200 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8201 ArgOffset + Intr->GradientStart,
8202 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8203 } else {
8204 for (unsigned I = ArgOffset + Intr->GradientStart;
8205 I < ArgOffset + Intr->CoordStart; I++)
8206 VAddrs.push_back(Op.getOperand(I));
8207 }
8208
8209 // Add addresses (packed or unpacked)
8210 if (IsA16) {
8211 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8212 ArgOffset + Intr->CoordStart, VAddrEnd,
8213 0 /* No gradients */);
8214 } else {
8215 // Add uncompressed address
8216 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8217 VAddrs.push_back(Op.getOperand(I));
8218 }
8219
8220 // If the register allocator cannot place the address registers contiguously
8221 // without introducing moves, then using the non-sequential address encoding
8222 // is always preferable, since it saves VALU instructions and is usually a
8223 // wash in terms of code size or even better.
8224 //
8225 // However, we currently have no way of hinting to the register allocator that
8226 // MIMG addresses should be placed contiguously when it is possible to do so,
8227 // so force non-NSA for the common 2-address case as a heuristic.
8228 //
8229 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8230 // allocation when possible.
8231 //
8232 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8233 // set of the remaining addresses.
8234 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8235 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8236 const bool UseNSA = ST->hasNSAEncoding() &&
8237 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8238 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8239 const bool UsePartialNSA =
8240 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8241
8242 SDValue VAddr;
8243 if (UsePartialNSA) {
8244 VAddr = getBuildDwordsVector(DAG, DL,
8245 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8246 } else if (!UseNSA) {
8247 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8248 }
8249
8250 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8251 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8252 SDValue Unorm;
8253 if (!BaseOpcode->Sampler) {
8254 Unorm = True;
8255 } else {
8256 uint64_t UnormConst =
8257 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8258
8259 Unorm = UnormConst ? True : False;
8260 }
8261
8262 SDValue TFE;
8263 SDValue LWE;
8264 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8265 bool IsTexFail = false;
8266 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8267 return Op;
8268
8269 if (IsTexFail) {
8270 if (!DMaskLanes) {
8271 // Expecting to get an error flag since TFC is on and dmask is 0.
8272 // Force dmask to be at least 1, otherwise the instruction will fail.
8273 DMask = 0x1;
8274 DMaskLanes = 1;
8275 NumVDataDwords = 1;
8276 }
8277 NumVDataDwords += 1;
8278 AdjustRetType = true;
8279 }
8280
8281 // Check whether something earlier tagged the return type as needing adjustment.
8282 // This happens if the instruction is a load or has set TexFailCtrl flags.
8283 if (AdjustRetType) {
8284 // NumVDataDwords reflects the true number of dwords required in the return
8285 // type
8286 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8287 // This is a no-op load. This can be eliminated
8288 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8289 if (isa<MemSDNode>(Op))
8290 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8291 return Undef;
8292 }
8293
8294 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8295 MVT::i32, NumVDataDwords)
8296 : MVT::i32;
8297
8298 ResultTypes[0] = NewVT;
8299 if (ResultTypes.size() == 3) {
8300 // Original result was aggregate type used for TexFailCtrl results
8301 // The actual instruction returns as a vector type which has now been
8302 // created. Remove the aggregate result.
8303 ResultTypes.erase(&ResultTypes[1]);
8304 }
8305 }
8306
8307 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8308 if (BaseOpcode->Atomic)
8309 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8310 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8312 return Op;
8313
8315 if (BaseOpcode->Store || BaseOpcode->Atomic)
8316 Ops.push_back(VData); // vdata
8317 if (UsePartialNSA) {
8318 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8319 Ops.push_back(VAddr);
8320 } else if (UseNSA)
8321 append_range(Ops, VAddrs);
8322 else
8323 Ops.push_back(VAddr);
8324 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8325 EVT RsrcVT = Rsrc.getValueType();
8326 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8327 return Op;
8328 Ops.push_back(Rsrc);
8329 if (BaseOpcode->Sampler) {
8330 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8331 if (Samp.getValueType() != MVT::v4i32)
8332 return Op;
8333 Ops.push_back(Samp);
8334 }
8335 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8336 if (IsGFX10Plus)
8337 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8338 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8339 Ops.push_back(Unorm);
8340 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8341 Ops.push_back(IsA16 && // r128, a16 for gfx9
8342 ST->hasFeature(AMDGPU::FeatureR128A16)
8343 ? True
8344 : False);
8345 if (IsGFX10Plus)
8346 Ops.push_back(IsA16 ? True : False);
8347 if (!Subtarget->hasGFX90AInsts()) {
8348 Ops.push_back(TFE); // tfe
8349 } else if (TFE->getAsZExtVal()) {
8350 report_fatal_error("TFE is not supported on this GPU");
8351 }
8352 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8353 Ops.push_back(LWE); // lwe
8354 if (!IsGFX10Plus)
8355 Ops.push_back(DimInfo->DA ? True : False);
8356 if (BaseOpcode->HasD16)
8357 Ops.push_back(IsD16 ? True : False);
8358 if (isa<MemSDNode>(Op))
8359 Ops.push_back(Op.getOperand(0)); // chain
8360
8361 int NumVAddrDwords =
8362 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8363 int Opcode = -1;
8364
8365 if (IsGFX12Plus) {
8366 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8367 NumVDataDwords, NumVAddrDwords);
8368 } else if (IsGFX11Plus) {
8369 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8370 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8371 : AMDGPU::MIMGEncGfx11Default,
8372 NumVDataDwords, NumVAddrDwords);
8373 } else if (IsGFX10Plus) {
8374 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8375 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8376 : AMDGPU::MIMGEncGfx10Default,
8377 NumVDataDwords, NumVAddrDwords);
8378 } else {
8379 if (Subtarget->hasGFX90AInsts()) {
8380 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8381 NumVDataDwords, NumVAddrDwords);
8382 if (Opcode == -1)
8384 "requested image instruction is not supported on this GPU");
8385 }
8386 if (Opcode == -1 &&
8388 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8389 NumVDataDwords, NumVAddrDwords);
8390 if (Opcode == -1)
8391 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8392 NumVDataDwords, NumVAddrDwords);
8393 }
8394 if (Opcode == -1)
8395 return Op;
8396
8397 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8398 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8399 MachineMemOperand *MemRef = MemOp->getMemOperand();
8400 DAG.setNodeMemRefs(NewNode, {MemRef});
8401 }
8402
8403 if (BaseOpcode->AtomicX2) {
8405 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8406 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8407 }
8408 if (BaseOpcode->NoReturn)
8409 return SDValue(NewNode, 0);
8410 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8411 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8412 NumVDataDwords, IsAtomicPacked16Bit, DL);
8413}
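
// A minimal sketch of the NSA decision above (helper and parameter names are
// illustrative, not part of this file): NSA is used once enough address
// registers are involved, and partial NSA covers the overflow beyond the
// encoding's maximum size on targets that support it.
static inline bool chooseNSASketch(bool HasNSAEncoding, unsigned NumVAddrs,
                                   unsigned NSAThreshold, unsigned NSAMaxSize,
                                   bool HasPartialNSA, bool &UsePartialNSA) {
  bool UseNSA = HasNSAEncoding && NumVAddrs >= NSAThreshold &&
                (NumVAddrs <= NSAMaxSize || HasPartialNSA);
  UsePartialNSA = UseNSA && HasPartialNSA && NumVAddrs > NSAMaxSize;
  return UseNSA;
}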
8414
8415SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8416 SDValue Offset, SDValue CachePolicy,
8417 SelectionDAG &DAG) const {
8419
8420 const DataLayout &DataLayout = DAG.getDataLayout();
8421 Align Alignment =
8423
8428 VT.getStoreSize(), Alignment);
8429
8430 if (!Offset->isDivergent()) {
8431 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8432
8433 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8434 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8435 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8436 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8437 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8438 SDValue BufferLoad =
8440 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8441 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8442 }
8443
8444 // Widen vec3 load to vec4.
8445 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8446 !Subtarget->hasScalarDwordx3Loads()) {
8447 EVT WidenedVT =
8449 auto WidenedOp = DAG.getMemIntrinsicNode(
8450 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8451 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8452 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8453 DAG.getVectorIdxConstant(0, DL));
8454 return Subvector;
8455 }
8456
8458 DAG.getVTList(VT), Ops, VT, MMO);
8459 }
8460
8461 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8462 // assume that the buffer is unswizzled.
8463 SDValue Ops[] = {
8464 DAG.getEntryNode(), // Chain
8465 Rsrc, // rsrc
8466 DAG.getConstant(0, DL, MVT::i32), // vindex
8467 {}, // voffset
8468 {}, // soffset
8469 {}, // offset
8470 CachePolicy, // cachepolicy
8471 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8472 };
8473 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8474 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8475 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8476 }
8477
8479 unsigned NumLoads = 1;
8480 MVT LoadVT = VT.getSimpleVT();
8481 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8482 assert((LoadVT.getScalarType() == MVT::i32 ||
8483 LoadVT.getScalarType() == MVT::f32));
8484
8485 if (NumElts == 8 || NumElts == 16) {
8486 NumLoads = NumElts / 4;
8487 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8488 }
8489
8490 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8491
8492 // Use the alignment to ensure that the required offsets will fit into the
8493 // immediate offsets.
8494 setBufferOffsets(Offset, DAG, &Ops[3],
8495 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8496
8497 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8498 for (unsigned i = 0; i < NumLoads; ++i) {
8499 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8500 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8501 LoadVT, MMO, DAG));
8502 }
8503
8504 if (NumElts == 8 || NumElts == 16)
8505 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8506
8507 return Loads[0];
8508}
8509
8510SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8511 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8512 if (!Subtarget->hasArchitectedSGPRs())
8513 return {};
8514 SDLoc SL(Op);
8515 MVT VT = MVT::i32;
8516 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8517 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8518 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8519}
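
// A minimal scalar sketch of the bitfield extract above (illustrative helper,
// not part of this file): with architected SGPRs the wave ID within the group
// occupies TTMP8[29:25], i.e. a 5-bit field starting at bit 25.
static inline uint32_t waveIdFromTTMP8Sketch(uint32_t TTMP8) {
  return (TTMP8 >> 25) & 0x1f;
}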
8520
8521SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8522 unsigned Dim,
8523 const ArgDescriptor &Arg) const {
8524 SDLoc SL(Op);
8526 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8527 if (MaxID == 0)
8528 return DAG.getConstant(0, SL, MVT::i32);
8529
8530 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8531 SDLoc(DAG.getEntryNode()), Arg);
8532
8533 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8534 // masking operations anyway.
8535 //
8536 // TODO: We could assert the top bit is 0 for the source copy.
8537 if (Arg.isMasked())
8538 return Val;
8539
8540 // Preserve the known bits after expansion to a copy.
8542 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8543 DAG.getValueType(SmallVT));
8544}
8545
8546SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8547 SelectionDAG &DAG) const {
8549 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8550
8551 EVT VT = Op.getValueType();
8552 SDLoc DL(Op);
8553 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8554
8555 // TODO: Should this propagate fast-math-flags?
8556
8557 switch (IntrinsicID) {
8558 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8559 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8560 return emitNonHSAIntrinsicError(DAG, DL, VT);
8561 return getPreloadedValue(DAG, *MFI, VT,
8563 }
8564 case Intrinsic::amdgcn_dispatch_ptr:
8565 case Intrinsic::amdgcn_queue_ptr: {
8566 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8567 DiagnosticInfoUnsupported BadIntrin(
8568 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8569 DL.getDebugLoc());
8570 DAG.getContext()->diagnose(BadIntrin);
8571 return DAG.getUNDEF(VT);
8572 }
8573
8574 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8577 return getPreloadedValue(DAG, *MFI, VT, RegID);
8578 }
8579 case Intrinsic::amdgcn_implicitarg_ptr: {
8580 if (MFI->isEntryFunction())
8581 return getImplicitArgPtr(DAG, DL);
8582 return getPreloadedValue(DAG, *MFI, VT,
8584 }
8585 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8587 // This only makes sense to call in a kernel, so just lower to null.
8588 return DAG.getConstant(0, DL, VT);
8589 }
8590
8591 return getPreloadedValue(DAG, *MFI, VT,
8593 }
8594 case Intrinsic::amdgcn_dispatch_id: {
8595 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8596 }
8597 case Intrinsic::amdgcn_rcp:
8598 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8599 case Intrinsic::amdgcn_rsq:
8600 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8601 case Intrinsic::amdgcn_rsq_legacy:
8602 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8603 return emitRemovedIntrinsicError(DAG, DL, VT);
8604 return SDValue();
8605 case Intrinsic::amdgcn_rcp_legacy:
8606 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8607 return emitRemovedIntrinsicError(DAG, DL, VT);
8608 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8609 case Intrinsic::amdgcn_rsq_clamp: {
8610 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8611 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8612
8613 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8614 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8615 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8616
8617 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8618 SDValue Tmp =
8619 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8620 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8621 DAG.getConstantFP(Min, DL, VT));
8622 }
8623 case Intrinsic::r600_read_ngroups_x:
8624 if (Subtarget->isAmdHsaOS())
8625 return emitNonHSAIntrinsicError(DAG, DL, VT);
8626
8627 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8628 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8629 false);
8630 case Intrinsic::r600_read_ngroups_y:
8631 if (Subtarget->isAmdHsaOS())
8632 return emitNonHSAIntrinsicError(DAG, DL, VT);
8633
8634 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8635 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8636 false);
8637 case Intrinsic::r600_read_ngroups_z:
8638 if (Subtarget->isAmdHsaOS())
8639 return emitNonHSAIntrinsicError(DAG, DL, VT);
8640
8641 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8642 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8643 false);
8644 case Intrinsic::r600_read_global_size_x:
8645 if (Subtarget->isAmdHsaOS())
8646 return emitNonHSAIntrinsicError(DAG, DL, VT);
8647
8648 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8649 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8650 Align(4), false);
8651 case Intrinsic::r600_read_global_size_y:
8652 if (Subtarget->isAmdHsaOS())
8653 return emitNonHSAIntrinsicError(DAG, DL, VT);
8654
8655 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8656 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8657 Align(4), false);
8658 case Intrinsic::r600_read_global_size_z:
8659 if (Subtarget->isAmdHsaOS())
8660 return emitNonHSAIntrinsicError(DAG, DL, VT);
8661
8662 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8663 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8664 Align(4), false);
8665 case Intrinsic::r600_read_local_size_x:
8666 if (Subtarget->isAmdHsaOS())
8667 return emitNonHSAIntrinsicError(DAG, DL, VT);
8668
8669 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8670 SI::KernelInputOffsets::LOCAL_SIZE_X);
8671 case Intrinsic::r600_read_local_size_y:
8672 if (Subtarget->isAmdHsaOS())
8673 return emitNonHSAIntrinsicError(DAG, DL, VT);
8674
8675 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8676 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8677 case Intrinsic::r600_read_local_size_z:
8678 if (Subtarget->isAmdHsaOS())
8679 return emitNonHSAIntrinsicError(DAG, DL, VT);
8680
8681 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8682 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8683 case Intrinsic::amdgcn_workgroup_id_x:
8684 return getPreloadedValue(DAG, *MFI, VT,
8685 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8686 case Intrinsic::amdgcn_workgroup_id_y:
8687 return getPreloadedValue(DAG, *MFI, VT,
8688 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8689 case Intrinsic::amdgcn_workgroup_id_z:
8690 return getPreloadedValue(DAG, *MFI, VT,
8691 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8692 case Intrinsic::amdgcn_wave_id:
8693 return lowerWaveID(DAG, Op);
8694 case Intrinsic::amdgcn_lds_kernel_id: {
8695 if (MFI->isEntryFunction())
8696 return getLDSKernelId(DAG, DL);
8697 return getPreloadedValue(DAG, *MFI, VT,
8698 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8699 }
8700 case Intrinsic::amdgcn_workitem_id_x:
8701 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8702 case Intrinsic::amdgcn_workitem_id_y:
8703 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8704 case Intrinsic::amdgcn_workitem_id_z:
8705 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8706 case Intrinsic::amdgcn_wavefrontsize:
8707 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8708 SDLoc(Op), MVT::i32);
8709 case Intrinsic::amdgcn_s_buffer_load: {
8710 unsigned CPol = Op.getConstantOperandVal(3);
8711 // s_buffer_load, because of how it's optimized, can't be volatile
8712 // so reject ones with the volatile bit set.
8713 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8714 ? AMDGPU::CPol::ALL
8715 : AMDGPU::CPol::ALL_pregfx12))
8716 return Op;
8717 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8718 Op.getOperand(3), DAG);
8719 }
8720 case Intrinsic::amdgcn_fdiv_fast:
8721 return lowerFDIV_FAST(Op, DAG);
8722 case Intrinsic::amdgcn_sin:
8723 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8724
8725 case Intrinsic::amdgcn_cos:
8726 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8727
8728 case Intrinsic::amdgcn_mul_u24:
8729 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8730 Op.getOperand(2));
8731 case Intrinsic::amdgcn_mul_i24:
8732 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8733 Op.getOperand(2));
8734
8735 case Intrinsic::amdgcn_log_clamp: {
8736 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8737 return SDValue();
8738
8739 return emitRemovedIntrinsicError(DAG, DL, VT);
8740 }
8741 case Intrinsic::amdgcn_fract:
8742 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8743
8744 case Intrinsic::amdgcn_class:
8745 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8746 Op.getOperand(2));
8747 case Intrinsic::amdgcn_div_fmas:
8748 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8749 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8750
8751 case Intrinsic::amdgcn_div_fixup:
8752 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8753 Op.getOperand(2), Op.getOperand(3));
8754
8755 case Intrinsic::amdgcn_div_scale: {
8756 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8757
8758 // Translate to the operands expected by the machine instruction. The
8759 // first parameter must be the same as the first instruction.
8760 SDValue Numerator = Op.getOperand(1);
8761 SDValue Denominator = Op.getOperand(2);
8762
8763 // Note this order is opposite of the machine instruction's operations,
8764 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8765 // intrinsic has the numerator as the first operand to match a normal
8766 // division operation.
8767
8768 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8769
8770 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8771 Denominator, Numerator);
8772 }
8773 case Intrinsic::amdgcn_icmp: {
8774 // There is a Pat that handles this variant, so return it as-is.
8775 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8776 Op.getConstantOperandVal(2) == 0 &&
8777 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8778 return Op;
8779 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8780 }
8781 case Intrinsic::amdgcn_fcmp: {
8782 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8783 }
8784 case Intrinsic::amdgcn_ballot:
8785 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8786 case Intrinsic::amdgcn_fmed3:
8787 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8788 Op.getOperand(2), Op.getOperand(3));
8789 case Intrinsic::amdgcn_fdot2:
8790 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8791 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8792 case Intrinsic::amdgcn_fmul_legacy:
8793 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8794 Op.getOperand(2));
8795 case Intrinsic::amdgcn_sffbh:
8796 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8797 case Intrinsic::amdgcn_sbfe:
8798 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8799 Op.getOperand(2), Op.getOperand(3));
8800 case Intrinsic::amdgcn_ubfe:
8801 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8802 Op.getOperand(2), Op.getOperand(3));
8803 case Intrinsic::amdgcn_cvt_pkrtz:
8804 case Intrinsic::amdgcn_cvt_pknorm_i16:
8805 case Intrinsic::amdgcn_cvt_pknorm_u16:
8806 case Intrinsic::amdgcn_cvt_pk_i16:
8807 case Intrinsic::amdgcn_cvt_pk_u16: {
8808 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8809 EVT VT = Op.getValueType();
8810 unsigned Opcode;
8811
8812 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8813 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8814 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8815 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8816 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8817 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8818 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8819 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8820 else
8821 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8822
8823 if (isTypeLegal(VT))
8824 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8825
8826 SDValue Node =
8827 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8828 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8829 }
8830 case Intrinsic::amdgcn_fmad_ftz:
8831 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8832 Op.getOperand(2), Op.getOperand(3));
8833
8834 case Intrinsic::amdgcn_if_break:
8835 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8836 Op->getOperand(1), Op->getOperand(2)),
8837 0);
8838
8839 case Intrinsic::amdgcn_groupstaticsize: {
8840 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8841 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8842 return Op;
8843
8844 const Module *M = MF.getFunction().getParent();
8845 const GlobalValue *GV =
8846 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8847 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8848 SIInstrInfo::MO_ABS32_LO);
8849 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8850 }
8851 case Intrinsic::amdgcn_is_shared:
8852 case Intrinsic::amdgcn_is_private: {
8853 SDLoc SL(Op);
8854 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8855 ? AMDGPUAS::LOCAL_ADDRESS
8856 : AMDGPUAS::PRIVATE_ADDRESS;
8857 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8858 SDValue SrcVec =
8859 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8860
8861 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8862 DAG.getConstant(1, SL, MVT::i32));
8863 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8864 }
8865 case Intrinsic::amdgcn_perm:
8866 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8867 Op.getOperand(2), Op.getOperand(3));
8868 case Intrinsic::amdgcn_reloc_constant: {
8869 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8870 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8871 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8872 auto *RelocSymbol = cast<GlobalVariable>(
8873 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8874 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8875 SIInstrInfo::MO_ABS32_LO);
8876 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8877 }
8878 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8879 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8880 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8881 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8882 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8883 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8884 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8885 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8886 if (Op.getOperand(4).getValueType() == MVT::i32)
8887 return SDValue();
8888
8889 SDLoc SL(Op);
8890 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8891 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8892 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8893 Op.getOperand(3), IndexKeyi32);
8894 }
8895 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8896 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8897 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8898 if (Op.getOperand(6).getValueType() == MVT::i32)
8899 return SDValue();
8900
8901 SDLoc SL(Op);
8902 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8903 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8904 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8905 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8906 IndexKeyi32, Op.getOperand(7)});
8907 }
8908 case Intrinsic::amdgcn_addrspacecast_nonnull:
8909 return lowerADDRSPACECAST(Op, DAG);
8910 case Intrinsic::amdgcn_readlane:
8911 case Intrinsic::amdgcn_readfirstlane:
8912 case Intrinsic::amdgcn_writelane:
8913 case Intrinsic::amdgcn_permlane16:
8914 case Intrinsic::amdgcn_permlanex16:
8915 case Intrinsic::amdgcn_permlane64:
8916 case Intrinsic::amdgcn_set_inactive:
8917 case Intrinsic::amdgcn_set_inactive_chain_arg:
8918 case Intrinsic::amdgcn_mov_dpp8:
8919 case Intrinsic::amdgcn_update_dpp:
8920 return lowerLaneOp(*this, Op.getNode(), DAG);
8921 default:
8922 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8923 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8924 return lowerImage(Op, ImageDimIntr, DAG, false);
8925
8926 return Op;
8927 }
8928}
8929
8930// On targets not supporting constant in soffset field, turn zero to
8931 // SGPR_NULL to avoid generating an extra s_mov with zero.
8932 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8933 const GCNSubtarget *Subtarget) {
8934 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8935 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8936 return SOffset;
8937}
8938
8939SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8940 SelectionDAG &DAG,
8941 unsigned NewOpcode) const {
8942 SDLoc DL(Op);
8943
8944 SDValue VData = Op.getOperand(2);
8945 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8946 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8947 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8948 SDValue Ops[] = {
8949 Op.getOperand(0), // Chain
8950 VData, // vdata
8951 Rsrc, // rsrc
8952 DAG.getConstant(0, DL, MVT::i32), // vindex
8953 VOffset, // voffset
8954 SOffset, // soffset
8955 Offset, // offset
8956 Op.getOperand(6), // cachepolicy
8957 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8958 };
8959
8960 auto *M = cast<MemSDNode>(Op);
8961
8962 EVT MemVT = VData.getValueType();
8963 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8964 M->getMemOperand());
8965}
8966
8967SDValue
8968SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8969 unsigned NewOpcode) const {
8970 SDLoc DL(Op);
8971
8972 SDValue VData = Op.getOperand(2);
8973 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8974 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8975 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8976 SDValue Ops[] = {
8977 Op.getOperand(0), // Chain
8978 VData, // vdata
8979 Rsrc, // rsrc
8980 Op.getOperand(4), // vindex
8981 VOffset, // voffset
8982 SOffset, // soffset
8983 Offset, // offset
8984 Op.getOperand(7), // cachepolicy
8985 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8986 };
8987
8988 auto *M = cast<MemSDNode>(Op);
8989
8990 EVT MemVT = VData.getValueType();
8991 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8992 M->getMemOperand());
8993}
8994
8995SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8996 SelectionDAG &DAG) const {
8997 unsigned IntrID = Op.getConstantOperandVal(1);
8998 SDLoc DL(Op);
8999
9000 switch (IntrID) {
9001 case Intrinsic::amdgcn_ds_ordered_add:
9002 case Intrinsic::amdgcn_ds_ordered_swap: {
9003 MemSDNode *M = cast<MemSDNode>(Op);
9004 SDValue Chain = M->getOperand(0);
9005 SDValue M0 = M->getOperand(2);
9006 SDValue Value = M->getOperand(3);
9007 unsigned IndexOperand = M->getConstantOperandVal(7);
9008 unsigned WaveRelease = M->getConstantOperandVal(8);
9009 unsigned WaveDone = M->getConstantOperandVal(9);
9010
9011 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9012 IndexOperand &= ~0x3f;
9013 unsigned CountDw = 0;
9014
9015 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9016 CountDw = (IndexOperand >> 24) & 0xf;
9017 IndexOperand &= ~(0xf << 24);
9018
9019 if (CountDw < 1 || CountDw > 4) {
9020 report_fatal_error(
9021 "ds_ordered_count: dword count must be between 1 and 4");
9022 }
9023 }
9024
9025 if (IndexOperand)
9026 report_fatal_error("ds_ordered_count: bad index operand");
9027
9028 if (WaveDone && !WaveRelease)
9029 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9030
9031 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9032 unsigned ShaderType =
9033 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9034 unsigned Offset0 = OrderedCountIndex << 2;
9035 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9036
9037 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9038 Offset1 |= (CountDw - 1) << 6;
9039
9040 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9041 Offset1 |= ShaderType << 2;
9042
9043 unsigned Offset = Offset0 | (Offset1 << 8);
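// The packed 16-bit offset operand therefore carries the ordered-count slot
// (OrderedCountIndex << 2) in its low byte and the control bits
// (wave_release, wave_done, instruction, shader type, dword count) in its
// high byte.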
9044
9045 SDValue Ops[] = {
9046 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9047 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9048 };
9049 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9050 M->getVTList(), Ops, M->getMemoryVT(),
9051 M->getMemOperand());
9052 }
9053 case Intrinsic::amdgcn_raw_buffer_load:
9054 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9055 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9056 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9057 case Intrinsic::amdgcn_raw_buffer_load_format:
9058 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9059 const bool IsFormat =
9060 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9061 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9062
9063 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9064 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9065 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9066 SDValue Ops[] = {
9067 Op.getOperand(0), // Chain
9068 Rsrc, // rsrc
9069 DAG.getConstant(0, DL, MVT::i32), // vindex
9070 VOffset, // voffset
9071 SOffset, // soffset
9072 Offset, // offset
9073 Op.getOperand(5), // cachepolicy, swizzled buffer
9074 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9075 };
9076
9077 auto *M = cast<MemSDNode>(Op);
9078 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9079 }
9080 case Intrinsic::amdgcn_struct_buffer_load:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9082 case Intrinsic::amdgcn_struct_buffer_load_format:
9083 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9084 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9085 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9086 const bool IsFormat =
9087 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9088 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9089
9090 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9091 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9092 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9093 SDValue Ops[] = {
9094 Op.getOperand(0), // Chain
9095 Rsrc, // rsrc
9096 Op.getOperand(3), // vindex
9097 VOffset, // voffset
9098 SOffset, // soffset
9099 Offset, // offset
9100 Op.getOperand(6), // cachepolicy, swizzled buffer
9101 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9102 };
9103
9104 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9105 }
9106 case Intrinsic::amdgcn_raw_tbuffer_load:
9107 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9108 MemSDNode *M = cast<MemSDNode>(Op);
9109 EVT LoadVT = Op.getValueType();
9110 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9111 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9112 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9113
9114 SDValue Ops[] = {
9115 Op.getOperand(0), // Chain
9116 Rsrc, // rsrc
9117 DAG.getConstant(0, DL, MVT::i32), // vindex
9118 VOffset, // voffset
9119 SOffset, // soffset
9120 Offset, // offset
9121 Op.getOperand(5), // format
9122 Op.getOperand(6), // cachepolicy, swizzled buffer
9123 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9124 };
9125
9126 if (LoadVT.getScalarType() == MVT::f16)
9127 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9128 Ops);
9129 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9130 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9131 DAG);
9132 }
9133 case Intrinsic::amdgcn_struct_tbuffer_load:
9134 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9135 MemSDNode *M = cast<MemSDNode>(Op);
9136 EVT LoadVT = Op.getValueType();
9137 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9138 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9139 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9140
9141 SDValue Ops[] = {
9142 Op.getOperand(0), // Chain
9143 Rsrc, // rsrc
9144 Op.getOperand(3), // vindex
9145 VOffset, // voffset
9146 SOffset, // soffset
9147 Offset, // offset
9148 Op.getOperand(6), // format
9149 Op.getOperand(7), // cachepolicy, swizzled buffer
9150 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9151 };
9152
9153 if (LoadVT.getScalarType() == MVT::f16)
9154 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9155 Ops);
9156 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9157 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9158 DAG);
9159 }
9160 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9162 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9163 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9164 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9165 return lowerStructBufferAtomicIntrin(Op, DAG,
9166 AMDGPUISD::BUFFER_ATOMIC_FADD);
9167 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9168 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9169 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9170 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9172 return lowerStructBufferAtomicIntrin(Op, DAG,
9173 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9174 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9175 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9176 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9177 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9178 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9179 return lowerStructBufferAtomicIntrin(Op, DAG,
9180 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9181 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9182 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9183 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9184 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9186 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9187 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9189 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9190 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9192 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9193 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9195 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9196 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9198 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9199 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9201 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9202 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9203 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9204 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9205 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9207 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9208 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9210 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9211 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9213 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9214 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9215 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9216 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9217 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9218 return lowerRawBufferAtomicIntrin(Op, DAG,
9219 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9220 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9222 return lowerStructBufferAtomicIntrin(Op, DAG,
9223 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9224 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9225 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9226 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9227 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9229 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9230 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9231 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9232 return lowerStructBufferAtomicIntrin(Op, DAG,
9233 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9234 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9235 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9236 return lowerStructBufferAtomicIntrin(Op, DAG,
9237 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9238 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9239 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9240 return lowerStructBufferAtomicIntrin(Op, DAG,
9241 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9242 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9243 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9244 return lowerStructBufferAtomicIntrin(Op, DAG,
9245 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9246 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9247 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9248 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9249 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9251 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9252 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9254 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9255 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9257 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9258 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9260 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9261 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9262 return lowerStructBufferAtomicIntrin(Op, DAG,
9263 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9264
9265 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9266 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9267 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9268 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9269 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9270 SDValue Ops[] = {
9271 Op.getOperand(0), // Chain
9272 Op.getOperand(2), // src
9273 Op.getOperand(3), // cmp
9274 Rsrc, // rsrc
9275 DAG.getConstant(0, DL, MVT::i32), // vindex
9276 VOffset, // voffset
9277 SOffset, // soffset
9278 Offset, // offset
9279 Op.getOperand(7), // cachepolicy
9280 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9281 };
9282 EVT VT = Op.getValueType();
9283 auto *M = cast<MemSDNode>(Op);
9284
9285 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9286 Op->getVTList(), Ops, VT,
9287 M->getMemOperand());
9288 }
9289 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9290 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9291 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9292 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9293 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9294 SDValue Ops[] = {
9295 Op.getOperand(0), // Chain
9296 Op.getOperand(2), // src
9297 Op.getOperand(3), // cmp
9298 Rsrc, // rsrc
9299 Op.getOperand(5), // vindex
9300 VOffset, // voffset
9301 SOffset, // soffset
9302 Offset, // offset
9303 Op.getOperand(8), // cachepolicy
9304 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9305 };
9306 EVT VT = Op.getValueType();
9307 auto *M = cast<MemSDNode>(Op);
9308
9309 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9310 Op->getVTList(), Ops, VT,
9311 M->getMemOperand());
9312 }
9313 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9314 MemSDNode *M = cast<MemSDNode>(Op);
9315 SDValue NodePtr = M->getOperand(2);
9316 SDValue RayExtent = M->getOperand(3);
9317 SDValue RayOrigin = M->getOperand(4);
9318 SDValue RayDir = M->getOperand(5);
9319 SDValue RayInvDir = M->getOperand(6);
9320 SDValue TDescr = M->getOperand(7);
9321
9322 assert(NodePtr.getValueType() == MVT::i32 ||
9323 NodePtr.getValueType() == MVT::i64);
9324 assert(RayDir.getValueType() == MVT::v3f16 ||
9325 RayDir.getValueType() == MVT::v3f32);
9326
9327 if (!Subtarget->hasGFX10_AEncoding()) {
9328 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9329 return SDValue();
9330 }
9331
9332 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9333 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9334 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9335 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9336 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9337 const unsigned NumVDataDwords = 4;
9338 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9339 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9340 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9341 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9342 IsGFX12Plus;
9343 const unsigned BaseOpcodes[2][2] = {
9344 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9345 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9346 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9347 int Opcode;
9348 if (UseNSA) {
9349 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9350 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9351 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9352 : AMDGPU::MIMGEncGfx10NSA,
9353 NumVDataDwords, NumVAddrDwords);
9354 } else {
9355 assert(!IsGFX12Plus);
9356 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9357 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9358 : AMDGPU::MIMGEncGfx10Default,
9359 NumVDataDwords, NumVAddrDwords);
9360 }
9361 assert(Opcode != -1);
9362
9363 SmallVector<SDValue, 16> Ops;
9364
9365 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9366 SmallVector<SDValue, 3> Lanes;
9367 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9368 if (Lanes[0].getValueSizeInBits() == 32) {
9369 for (unsigned I = 0; I < 3; ++I)
9370 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9371 } else {
9372 if (IsAligned) {
9373 Ops.push_back(DAG.getBitcast(
9374 MVT::i32,
9375 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9376 Ops.push_back(Lanes[2]);
9377 } else {
9378 SDValue Elt0 = Ops.pop_back_val();
9379 Ops.push_back(DAG.getBitcast(
9380 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9381 Ops.push_back(DAG.getBitcast(
9382 MVT::i32,
9383 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9384 }
9385 }
9386 };
9387
9388 if (UseNSA && IsGFX11Plus) {
9389 Ops.push_back(NodePtr);
9390 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9391 Ops.push_back(RayOrigin);
9392 if (IsA16) {
9393 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9394 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9395 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9396 for (unsigned I = 0; I < 3; ++I) {
9397 MergedLanes.push_back(DAG.getBitcast(
9398 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9399 {DirLanes[I], InvDirLanes[I]})));
9400 }
9401 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9402 } else {
9403 Ops.push_back(RayDir);
9404 Ops.push_back(RayInvDir);
9405 }
9406 } else {
9407 if (Is64)
9408 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9409 2);
9410 else
9411 Ops.push_back(NodePtr);
9412
9413 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9414 packLanes(RayOrigin, true);
9415 packLanes(RayDir, true);
9416 packLanes(RayInvDir, false);
9417 }
9418
9419 if (!UseNSA) {
9420 // Build a single vector containing all the operands so far prepared.
9421 if (NumVAddrDwords > 12) {
9422 SDValue Undef = DAG.getUNDEF(MVT::i32);
9423 Ops.append(16 - Ops.size(), Undef);
9424 }
9425 assert(Ops.size() >= 8 && Ops.size() <= 12);
9426 SDValue MergedOps =
9427 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9428 Ops.clear();
9429 Ops.push_back(MergedOps);
9430 }
9431
9432 Ops.push_back(TDescr);
9433 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9434 Ops.push_back(M->getChain());
9435
9436 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9437 MachineMemOperand *MemRef = M->getMemOperand();
9438 DAG.setNodeMemRefs(NewNode, {MemRef});
9439 return SDValue(NewNode, 0);
9440 }
9441 case Intrinsic::amdgcn_global_atomic_fmin_num:
9442 case Intrinsic::amdgcn_global_atomic_fmax_num:
9443 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9444 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9445 MemSDNode *M = cast<MemSDNode>(Op);
9446 SDValue Ops[] = {
9447 M->getOperand(0), // Chain
9448 M->getOperand(2), // Ptr
9449 M->getOperand(3) // Value
9450 };
9451 unsigned Opcode = 0;
9452 switch (IntrID) {
9453 case Intrinsic::amdgcn_global_atomic_fmin_num:
9454 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9455 Opcode = ISD::ATOMIC_LOAD_FMIN;
9456 break;
9457 }
9458 case Intrinsic::amdgcn_global_atomic_fmax_num:
9459 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9460 Opcode = ISD::ATOMIC_LOAD_FMAX;
9461 break;
9462 }
9463 default:
9464 llvm_unreachable("unhandled atomic opcode");
9465 }
9466 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9467 Ops, M->getMemOperand());
9468 }
9469 case Intrinsic::amdgcn_s_get_barrier_state:
9470 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9471 SDValue Chain = Op->getOperand(0);
9472 SmallVector<SDValue, 2> Ops;
9473 unsigned Opc;
9474
9475 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9476 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9477 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9478 BarID = (BarID >> 4) & 0x3F;
9479 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9480 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9481 Ops.push_back(K);
9482 Ops.push_back(Chain);
9483 } else {
9484 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9485 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9486 SDValue M0Val;
9487 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9488 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9489 M0Val = SDValue(
9490 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9491 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9492 0);
9493 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9494 } else
9495 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9496 }
9497
9498 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9499 return SDValue(NewMI, 0);
9500 }
9501 default:
9502
9503 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9504 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9505 return lowerImage(Op, ImageDimIntr, DAG, true);
9506
9507 return SDValue();
9508 }
9509}
9510
9511// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9512// dwordx4 if on SI and handle TFE loads.
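// For example, a v3i32 result on a subtarget without dwordx3 load/store
// support is widened to v4i32 and trimmed back with EXTRACT_SUBVECTOR, while
// a TFE load carries one extra dword that is split out as the status value.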
9513SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9514 SDVTList VTList,
9515 ArrayRef<SDValue> Ops, EVT MemVT,
9516 MachineMemOperand *MMO,
9517 SelectionDAG &DAG) const {
9518 LLVMContext &C = *DAG.getContext();
9519 MachineFunction &MF = DAG.getMachineFunction();
9520 EVT VT = VTList.VTs[0];
9521
9522 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9523 bool IsTFE = VTList.NumVTs == 3;
9524 if (IsTFE) {
9525 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9526 unsigned NumOpDWords = NumValueDWords + 1;
9527 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9528 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9529 MachineMemOperand *OpDWordsMMO =
9530 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9531 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9532 OpDWordsVT, OpDWordsMMO, DAG);
9533 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9534 DAG.getVectorIdxConstant(NumValueDWords, DL));
9535 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9536 SDValue ValueDWords =
9537 NumValueDWords == 1
9538 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9539 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9540 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9541 ZeroIdx);
9542 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9543 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9544 }
9545
9546 if (!Subtarget->hasDwordx3LoadStores() &&
9547 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9548 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9549 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9550 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9551 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9552 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9553 WidenedMemVT, WidenedMMO);
9554 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9555 DAG.getVectorIdxConstant(0, DL));
9556 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9557 }
9558
9559 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9560}
9561
9562SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9563 bool ImageStore) const {
9564 EVT StoreVT = VData.getValueType();
9565
9566 // No change for f16 and legal vector D16 types.
9567 if (!StoreVT.isVector())
9568 return VData;
9569
9570 SDLoc DL(VData);
9571 unsigned NumElements = StoreVT.getVectorNumElements();
9572
9573 if (Subtarget->hasUnpackedD16VMem()) {
9574 // We need to unpack the packed data to store.
9575 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9576 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9577
9578 EVT EquivStoreVT =
9579 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9580 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9581 return DAG.UnrollVectorOp(ZExt.getNode());
9582 }
9583
9584 // The sq block of gfx8.1 does not estimate register use correctly for d16
9585 // image store instructions. The data operand is computed as if it were not a
9586 // d16 image instruction.
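// As a workaround, the packed data is rewritten below as a vector of i32,
// e.g. a v4f16 store becomes a v4i32 store whose first two lanes hold the
// packed halves and whose remaining lanes are undef padding.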
9587 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9588 // Bitcast to i16
9589 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9590 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9591
9592 // Decompose into scalars
9593 SmallVector<SDValue, 4> Elts;
9594 DAG.ExtractVectorElements(IntVData, Elts);
9595
9596 // Group pairs of i16 into v2i16 and bitcast to i32
9597 SmallVector<SDValue, 4> PackedElts;
9598 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9599 SDValue Pair =
9600 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9601 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9602 PackedElts.push_back(IntPair);
9603 }
9604 if ((NumElements % 2) == 1) {
9605 // Handle v3i16
9606 unsigned I = Elts.size() / 2;
9607 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9608 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9609 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9610 PackedElts.push_back(IntPair);
9611 }
9612
9613 // Pad using UNDEF
9614 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9615
9616 // Build final vector
9617 EVT VecVT =
9618 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9619 return DAG.getBuildVector(VecVT, DL, PackedElts);
9620 }
9621
9622 if (NumElements == 3) {
9623 EVT IntStoreVT =
9624 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9625 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9626
9627 EVT WidenedStoreVT = EVT::getVectorVT(
9628 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9629 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9630 WidenedStoreVT.getStoreSizeInBits());
9631 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9632 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9633 }
9634
9635 assert(isTypeLegal(StoreVT));
9636 return VData;
9637}
9638
9639SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9640 SelectionDAG &DAG) const {
9641 SDLoc DL(Op);
9642 SDValue Chain = Op.getOperand(0);
9643 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9644 MachineFunction &MF = DAG.getMachineFunction();
9645
9646 switch (IntrinsicID) {
9647 case Intrinsic::amdgcn_exp_compr: {
9648 if (!Subtarget->hasCompressedExport()) {
9649 DiagnosticInfoUnsupported BadIntrin(
9650 DAG.getMachineFunction().getFunction(),
9651 "intrinsic not supported on subtarget", DL.getDebugLoc());
9652 DAG.getContext()->diagnose(BadIntrin);
9653 }
9654 SDValue Src0 = Op.getOperand(4);
9655 SDValue Src1 = Op.getOperand(5);
9656 // Hack around illegal type on SI by directly selecting it.
9657 if (isTypeLegal(Src0.getValueType()))
9658 return SDValue();
9659
9660 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9661 SDValue Undef = DAG.getUNDEF(MVT::f32);
9662 const SDValue Ops[] = {
9663 Op.getOperand(2), // tgt
9664 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9665 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9666 Undef, // src2
9667 Undef, // src3
9668 Op.getOperand(7), // vm
9669 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9670 Op.getOperand(3), // en
9671 Op.getOperand(0) // Chain
9672 };
9673
9674 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9675 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9676 }
9677 case Intrinsic::amdgcn_s_barrier:
9678 case Intrinsic::amdgcn_s_barrier_signal:
9679 case Intrinsic::amdgcn_s_barrier_wait: {
9680 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9681 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9682 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9683 if (WGSize <= ST.getWavefrontSize()) {
9684 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9685 // s_barrier/s_barrier_wait to wave_barrier.
9686 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9687 return Op.getOperand(0);
9688 else
9689 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9690 MVT::Other, Op.getOperand(0)),
9691 0);
9692 }
9693 }
9694
9695 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9696 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9697 SDValue K =
9698 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9699 SDValue BarSignal =
9700 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9701 MVT::Other, K, Op.getOperand(0)),
9702 0);
9703 SDValue BarWait =
9704 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9705 BarSignal.getValue(0)),
9706 0);
9707 return BarWait;
9708 }
9709
9710 return SDValue();
9711 };
9712
9713 case Intrinsic::amdgcn_struct_tbuffer_store:
9714 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9715 SDValue VData = Op.getOperand(2);
9716 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9717 if (IsD16)
9718 VData = handleD16VData(VData, DAG);
9719 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9720 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9721 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9722 SDValue Ops[] = {
9723 Chain,
9724 VData, // vdata
9725 Rsrc, // rsrc
9726 Op.getOperand(4), // vindex
9727 VOffset, // voffset
9728 SOffset, // soffset
9729 Offset, // offset
9730 Op.getOperand(7), // format
9731 Op.getOperand(8), // cachepolicy, swizzled buffer
9732 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9733 };
9734 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9735 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9736 MemSDNode *M = cast<MemSDNode>(Op);
9737 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9738 M->getMemoryVT(), M->getMemOperand());
9739 }
9740
9741 case Intrinsic::amdgcn_raw_tbuffer_store:
9742 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9743 SDValue VData = Op.getOperand(2);
9744 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9745 if (IsD16)
9746 VData = handleD16VData(VData, DAG);
9747 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9748 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9749 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9750 SDValue Ops[] = {
9751 Chain,
9752 VData, // vdata
9753 Rsrc, // rsrc
9754 DAG.getConstant(0, DL, MVT::i32), // vindex
9755 VOffset, // voffset
9756 SOffset, // soffset
9757 Offset, // offset
9758 Op.getOperand(6), // format
9759 Op.getOperand(7), // cachepolicy, swizzled buffer
9760 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9761 };
9762 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9763 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9764 MemSDNode *M = cast<MemSDNode>(Op);
9765 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9766 M->getMemoryVT(), M->getMemOperand());
9767 }
9768
9769 case Intrinsic::amdgcn_raw_buffer_store:
9770 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9771 case Intrinsic::amdgcn_raw_buffer_store_format:
9772 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9773 const bool IsFormat =
9774 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9775 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9776
9777 SDValue VData = Op.getOperand(2);
9778 EVT VDataVT = VData.getValueType();
9779 EVT EltType = VDataVT.getScalarType();
9780 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9781 if (IsD16) {
9782 VData = handleD16VData(VData, DAG);
9783 VDataVT = VData.getValueType();
9784 }
9785
9786 if (!isTypeLegal(VDataVT)) {
9787 VData =
9788 DAG.getNode(ISD::BITCAST, DL,
9789 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9790 }
9791
9792 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9793 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9794 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9795 SDValue Ops[] = {
9796 Chain,
9797 VData,
9798 Rsrc,
9799 DAG.getConstant(0, DL, MVT::i32), // vindex
9800 VOffset, // voffset
9801 SOffset, // soffset
9802 Offset, // offset
9803 Op.getOperand(6), // cachepolicy, swizzled buffer
9804 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9805 };
9806 unsigned Opc =
9807 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9808 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9809 MemSDNode *M = cast<MemSDNode>(Op);
9810
9811 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9812 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9813 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9814
9815 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9816 M->getMemoryVT(), M->getMemOperand());
9817 }
9818
9819 case Intrinsic::amdgcn_struct_buffer_store:
9820 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9821 case Intrinsic::amdgcn_struct_buffer_store_format:
9822 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9823 const bool IsFormat =
9824 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9825 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9826
9827 SDValue VData = Op.getOperand(2);
9828 EVT VDataVT = VData.getValueType();
9829 EVT EltType = VDataVT.getScalarType();
9830 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9831
9832 if (IsD16) {
9833 VData = handleD16VData(VData, DAG);
9834 VDataVT = VData.getValueType();
9835 }
9836
9837 if (!isTypeLegal(VDataVT)) {
9838 VData =
9839 DAG.getNode(ISD::BITCAST, DL,
9840 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9841 }
9842
9843 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9844 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9845 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9846 SDValue Ops[] = {
9847 Chain,
9848 VData,
9849 Rsrc,
9850 Op.getOperand(4), // vindex
9851 VOffset, // voffset
9852 SOffset, // soffset
9853 Offset, // offset
9854 Op.getOperand(7), // cachepolicy, swizzled buffer
9855 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9856 };
9857 unsigned Opc =
9858 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9859 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9860 MemSDNode *M = cast<MemSDNode>(Op);
9861
9862 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9863 EVT VDataType = VData.getValueType().getScalarType();
9864 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9865 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9866
9867 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9868 M->getMemoryVT(), M->getMemOperand());
9869 }
9870 case Intrinsic::amdgcn_raw_buffer_load_lds:
9871 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9872 case Intrinsic::amdgcn_struct_buffer_load_lds:
9873 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9874 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9875 unsigned Opc;
9876 bool HasVIndex =
9877 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9878 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9879 unsigned OpOffset = HasVIndex ? 1 : 0;
9880 SDValue VOffset = Op.getOperand(5 + OpOffset);
9881 bool HasVOffset = !isNullConstant(VOffset);
9882 unsigned Size = Op->getConstantOperandVal(4);
9883
9884 switch (Size) {
9885 default:
9886 return SDValue();
9887 case 1:
9888 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9889 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9890 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9891 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9892 break;
9893 case 2:
9894 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9895 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9896 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9897 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9898 break;
9899 case 4:
9900 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9901 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9902 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9903 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9904 break;
9905 case 12:
9906 if (!Subtarget->hasLDSLoadB96_B128())
9907 return SDValue();
9908 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9909 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9910 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9911 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9912 break;
9913 case 16:
9914 if (!Subtarget->hasLDSLoadB96_B128())
9915 return SDValue();
9916 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9917 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9918 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9919 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9920 break;
9921 }
9922
9923 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9924
9925 SmallVector<SDValue, 8> Ops;
9926
9927 if (HasVIndex && HasVOffset)
9928 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9929 {Op.getOperand(5), // VIndex
9930 VOffset}));
9931 else if (HasVIndex)
9932 Ops.push_back(Op.getOperand(5));
9933 else if (HasVOffset)
9934 Ops.push_back(VOffset);
9935
9936 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9937 Ops.push_back(Rsrc);
9938 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9939 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9940 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9941 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9942 Ops.push_back(DAG.getTargetConstant(
9943 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9944 DL, MVT::i8)); // cpol
9945 Ops.push_back(DAG.getTargetConstant(
9946 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9947 ? 1
9948 : 0,
9949 DL, MVT::i8)); // swz
9950 Ops.push_back(M0Val.getValue(0)); // Chain
9951 Ops.push_back(M0Val.getValue(1)); // Glue
9952
9953 auto *M = cast<MemSDNode>(Op);
9954 MachineMemOperand *LoadMMO = M->getMemOperand();
9955 // Don't set the offset value here because the pointer points to the base of
9956 // the buffer.
9957 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9958
9959 MachinePointerInfo StorePtrI = LoadPtrI;
9960 LoadPtrI.V = PoisonValue::get(
9961 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9962 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9963 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9964
9965 auto F = LoadMMO->getFlags() &
9966 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9967 LoadMMO =
9968 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9969 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9970
9971 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9972 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9973 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9974
9975 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9976 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9977
9978 return SDValue(Load, 0);
9979 }
9980 case Intrinsic::amdgcn_global_load_lds: {
9981 unsigned Opc;
9982 unsigned Size = Op->getConstantOperandVal(4);
9983 switch (Size) {
9984 default:
9985 return SDValue();
9986 case 1:
9987 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9988 break;
9989 case 2:
9990 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9991 break;
9992 case 4:
9993 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9994 break;
9995 case 12:
9996 if (!Subtarget->hasLDSLoadB96_B128())
9997 return SDValue();
9998 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9999 break;
10000 case 16:
10001 if (!Subtarget->hasLDSLoadB96_B128())
10002 return SDValue();
10003 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10004 break;
10005 }
10006
10007 auto *M = cast<MemSDNode>(Op);
10008 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10009
10010 SmallVector<SDValue, 6> Ops;
10011
10012 SDValue Addr = Op.getOperand(2); // Global ptr
10013 SDValue VOffset;
10014 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10015 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10016 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10017 SDValue LHS = Addr.getOperand(0);
10018 SDValue RHS = Addr.getOperand(1);
10019
10020 if (LHS->isDivergent())
10021 std::swap(LHS, RHS);
10022
10023 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10024 RHS.getOperand(0).getValueType() == MVT::i32) {
10025 // add (i64 sgpr), (zero_extend (i32 vgpr))
10026 Addr = LHS;
10027 VOffset = RHS.getOperand(0);
10028 }
10029 }
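// If the split succeeded, Addr now holds the uniform base (used as saddr)
// and VOffset the divergent 32-bit offset; otherwise the unsplit address is
// pushed as-is below.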
10030
10031 Ops.push_back(Addr);
10032 if (!Addr->isDivergent()) {
10033 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10034 if (!VOffset)
10035 VOffset =
10036 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10037 DAG.getTargetConstant(0, DL, MVT::i32)),
10038 0);
10039 Ops.push_back(VOffset);
10040 }
10041
10042 Ops.push_back(Op.getOperand(5)); // Offset
10043 Ops.push_back(Op.getOperand(6)); // CPol
10044 Ops.push_back(M0Val.getValue(0)); // Chain
10045 Ops.push_back(M0Val.getValue(1)); // Glue
10046
10047 MachineMemOperand *LoadMMO = M->getMemOperand();
10048 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10049 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10050 MachinePointerInfo StorePtrI = LoadPtrI;
10051 LoadPtrI.V = PoisonValue::get(
10052 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10053 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10054 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10055 auto F = LoadMMO->getFlags() &
10056 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10057 LoadMMO =
10058 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10059 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10060 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10061 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10062 LoadMMO->getAAInfo());
10063
10064 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10065 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10066
10067 return SDValue(Load, 0);
10068 }
10069 case Intrinsic::amdgcn_end_cf:
10070 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10071 Op->getOperand(2), Chain),
10072 0);
10073 case Intrinsic::amdgcn_s_barrier_init:
10074 case Intrinsic::amdgcn_s_barrier_signal_var: {
10075 // these two intrinsics have two operands: barrier pointer and member count
10076 SDValue Chain = Op->getOperand(0);
10077 SmallVector<SDValue, 2> Ops;
10078 SDValue BarOp = Op->getOperand(2);
10079 SDValue CntOp = Op->getOperand(3);
10080 SDValue M0Val;
10081 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10082 ? AMDGPU::S_BARRIER_INIT_M0
10083 : AMDGPU::S_BARRIER_SIGNAL_M0;
10084 // extract the BarrierID from bits 4-9 of BarOp
10085 SDValue BarID;
10086 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10087 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10088 BarID =
10089 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10090 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10091 0);
10092 // Member count should be put into M0[ShAmt:+6]
10093 // Barrier ID should be put into M0[5:0]
10094 M0Val =
10095 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10096 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10097 0);
10098 constexpr unsigned ShAmt = 16;
10099 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10100 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10101
10102 M0Val = SDValue(
10103 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10104
10105 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10106
10107 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10108 return SDValue(NewMI, 0);
10109 }
10110 case Intrinsic::amdgcn_s_barrier_join: {
10111 // this intrinsic has one operand: the barrier pointer
10112 SDValue Chain = Op->getOperand(0);
10113 SmallVector<SDValue, 2> Ops;
10114 SDValue BarOp = Op->getOperand(2);
10115 unsigned Opc;
10116
10117 if (isa<ConstantSDNode>(BarOp)) {
10118 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10119 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10120
10121 // extract the BarrierID from bits 4-9 of the immediate
10122 unsigned BarID = (BarVal >> 4) & 0x3F;
10123 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10124 Ops.push_back(K);
10125 Ops.push_back(Chain);
10126 } else {
10127 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10128
10129 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10130 SDValue M0Val;
10131 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10132 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10133 M0Val =
10134 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10135 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10136 0);
10137 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10138 }
10139
10140 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10141 return SDValue(NewMI, 0);
10142 }
10143 case Intrinsic::amdgcn_s_prefetch_data: {
10144 // For non-global address space preserve the chain and remove the call.
10145 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10146 return Op.getOperand(0);
10147 return Op;
10148 }
10149 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10150 SDValue Ops[] = {
10151 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10152 Op.getOperand(3), // offset
10153 Op.getOperand(4), // length
10154 };
10155
10156 MemSDNode *M = cast<MemSDNode>(Op);
10157 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10158 Op->getVTList(), Ops, M->getMemoryVT(),
10159 M->getMemOperand());
10160 }
10161 default: {
10162 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10163 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10164 return lowerImage(Op, ImageDimIntr, DAG, true);
10165
10166 return Op;
10167 }
10168 }
10169}
10170
10171// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10172// offset (the offset that is included in bounds checking and swizzling, to be
10173// split between the instruction's voffset and immoffset fields) and soffset
10174// (the offset that is excluded from bounds checking and swizzling, to go in
10175// the instruction's soffset field). This function takes the first kind of
10176// offset and figures out how to split it between voffset and immoffset.
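// For example, with a 12-bit immoffset field (MaxImm == 4095 on most
// subtargets; see getMaxMUBUFImmOffset), a combined offset of 4100 is split
// into voffset = 4096 and immoffset = 4, so the 4096 add stands a chance of
// being CSEd with neighboring accesses.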
10177std::pair<SDValue, SDValue>
10178SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10179 SDLoc DL(Offset);
10180 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10181 SDValue N0 = Offset;
10182 ConstantSDNode *C1 = nullptr;
10183
10184 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10185 N0 = SDValue();
10186 else if (DAG.isBaseWithConstantOffset(N0)) {
10187 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10188 N0 = N0.getOperand(0);
10189 }
10190
10191 if (C1) {
10192 unsigned ImmOffset = C1->getZExtValue();
10193 // If the immediate value is too big for the immoffset field, put only bits
10194 // that would normally fit in the immoffset field. The remaining value that
10195 // is copied/added for the voffset field is a large power of 2, and it
10196 // stands more chance of being CSEd with the copy/add for another similar
10197 // load/store.
10198 // However, do not do that rounding down if that is a negative
10199 // number, as it appears to be illegal to have a negative offset in the
10200 // vgpr, even if adding the immediate offset makes it positive.
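// Example with MaxImm == 4095: ImmOffset 0x1234 splits into Overflow 0x1000
// (added to the voffset) and ImmOffset 0x234. If the rounded-down part would
// be negative as an i32 (e.g. ImmOffset 0x80000004 -> Overflow 0x80000000),
// the split is undone and the entire value goes to the voffset instead.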
10201 unsigned Overflow = ImmOffset & ~MaxImm;
10202 ImmOffset -= Overflow;
10203 if ((int32_t)Overflow < 0) {
10204 Overflow += ImmOffset;
10205 ImmOffset = 0;
10206 }
10207 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10208 if (Overflow) {
10209 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10210 if (!N0)
10211 N0 = OverflowVal;
10212 else {
10213 SDValue Ops[] = {N0, OverflowVal};
10214 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10215 }
10216 }
10217 }
10218 if (!N0)
10219 N0 = DAG.getConstant(0, DL, MVT::i32);
10220 if (!C1)
10221 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10222 return {N0, SDValue(C1, 0)};
10223}
10224
10225// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10226// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10227// pointed to by Offsets.
10228void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10229 SelectionDAG &DAG, SDValue *Offsets,
10230 Align Alignment) const {
10231 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10232 SDLoc DL(CombinedOffset);
10233 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10234 uint32_t Imm = C->getZExtValue();
10235 uint32_t SOffset, ImmOffset;
10236 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10237 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10238 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10239 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10240 return;
10241 }
10242 }
10243 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10244 SDValue N0 = CombinedOffset.getOperand(0);
10245 SDValue N1 = CombinedOffset.getOperand(1);
10246 uint32_t SOffset, ImmOffset;
10247 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10248 if (Offset >= 0 &&
10249 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10250 Offsets[0] = N0;
10251 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10252 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10253 return;
10254 }
10255 }
10256
10257 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10258 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10259 : DAG.getConstant(0, DL, MVT::i32);
10260
10261 Offsets[0] = CombinedOffset;
10262 Offsets[1] = SOffsetZero;
10263 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10264}
10265
10266SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10267 SelectionDAG &DAG) const {
10268 if (!MaybePointer.getValueType().isScalarInteger())
10269 return MaybePointer;
10270
10271 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10272 return Rsrc;
10273}
10274
10275// Wrap a global or flat pointer into a buffer intrinsic using the flags
10276// specified in the intrinsic.
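// The resulting v4i32 descriptor is assembled from the operands as:
// word 0: pointer bits [31:0]
// word 1: pointer bits [47:32] in [15:0], stride in [31:16]
// word 2: NumRecords
// word 3: Flags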
10277SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10278 SelectionDAG &DAG) const {
10279 SDLoc Loc(Op);
10280
10281 SDValue Pointer = Op->getOperand(1);
10282 SDValue Stride = Op->getOperand(2);
10283 SDValue NumRecords = Op->getOperand(3);
10284 SDValue Flags = Op->getOperand(4);
10285
10286 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10287 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10288 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10289 std::optional<uint32_t> ConstStride = std::nullopt;
10290 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10291 ConstStride = ConstNode->getZExtValue();
10292
10293 SDValue NewHighHalf = Masked;
10294 if (!ConstStride || *ConstStride != 0) {
10295 SDValue ShiftedStride;
10296 if (ConstStride) {
10297 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10298 } else {
10299 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10300 ShiftedStride =
10301 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10302 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10303 }
10304 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10305 }
10306
10307 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10308 NewHighHalf, NumRecords, Flags);
10309 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10310 return RsrcPtr;
10311}
10312
10313// Handle 8 bit and 16 bit buffer loads
10314SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10315 EVT LoadVT, SDLoc DL,
10316 ArrayRef<SDValue> Ops,
10317 MachineMemOperand *MMO,
10318 bool IsTFE) const {
10319 EVT IntVT = LoadVT.changeTypeToInteger();
10320
10321 if (IsTFE) {
10322 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10323 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10324 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10325 MachineFunction &MF = DAG.getMachineFunction();
10326 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10327 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10328 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10329 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10330 DAG.getConstant(1, DL, MVT::i32));
10331 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10332 DAG.getConstant(0, DL, MVT::i32));
10333 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10334 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10335 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10336 }
10337
10338 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10339 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10340 : AMDGPUISD::BUFFER_LOAD_USHORT;
10341
10342 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10343 SDValue BufferLoad =
10344 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10345 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10346 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10347
10348 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10349}
10350
10351// Handle 8 bit and 16 bit buffer stores
10352SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10353 EVT VDataType, SDLoc DL,
10354 SDValue Ops[],
10355 MemSDNode *M) const {
10356 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10357 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10358
10359 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10360 Ops[1] = BufferStoreExt;
10361 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10362 : AMDGPUISD::BUFFER_STORE_SHORT;
10363 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10364 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10365 M->getMemOperand());
10366}
10367
10368 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10369 SDValue Op, const SDLoc &SL, EVT VT) {
10370 if (VT.bitsLT(Op.getValueType()))
10371 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10372
10373 switch (ExtType) {
10374 case ISD::SEXTLOAD:
10375 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10376 case ISD::ZEXTLOAD:
10377 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10378 case ISD::EXTLOAD:
10379 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10380 case ISD::NON_EXTLOAD:
10381 return Op;
10382 }
10383
10384 llvm_unreachable("invalid ext type");
10385}
10386
10387// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10388// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
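// For example, a uniform zext i8 load from the constant address space with
// align >= 4 is widened here to a full i32 load (selectable to s_load_dword),
// and the original value is recovered by zero-extending the low 8 bits in
// the register.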
10389SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10390 DAGCombinerInfo &DCI) const {
10391 SelectionDAG &DAG = DCI.DAG;
10392 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10393 return SDValue();
10394
10395 // FIXME: Constant loads should all be marked invariant.
10396 unsigned AS = Ld->getAddressSpace();
10397 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10398 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10399 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10400 return SDValue();
10401
10402 // Don't do this early, since it may interfere with adjacent load merging for
10403 // illegal types. We can avoid losing alignment information for exotic types
10404 // pre-legalize.
10405 EVT MemVT = Ld->getMemoryVT();
10406 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10407 MemVT.getSizeInBits() >= 32)
10408 return SDValue();
10409
10410 SDLoc SL(Ld);
10411
10412 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10413 "unexpected vector extload");
10414
10415 // TODO: Drop only high part of range.
10416 SDValue Ptr = Ld->getBasePtr();
10417 SDValue NewLoad = DAG.getLoad(
10418 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10419 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10420 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10421 nullptr); // Drop ranges
10422
10423 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10424 if (MemVT.isFloatingPoint()) {
10426 "unexpected fp extload");
10427 TruncVT = MemVT.changeTypeToInteger();
10428 }
10429
10430 SDValue Cvt = NewLoad;
10431 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10432 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10433 DAG.getValueType(TruncVT));
10434 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10435 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10436 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10437 } else {
10438 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10439 }
10440
10441 EVT VT = Ld->getValueType(0);
10442 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10443
10444 DCI.AddToWorklist(Cvt.getNode());
10445
10446 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10447 // the appropriate extension from the 32-bit load.
10448 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10449 DCI.AddToWorklist(Cvt.getNode());
10450
10451 // Handle conversion back to floating point if necessary.
10452 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10453
10454 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10455}
10456
10457 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10458 const SIMachineFunctionInfo &Info) {
10459 // TODO: Should check if the address can definitely not access stack.
10460 if (Info.isEntryFunction())
10461 return Info.getUserSGPRInfo().hasFlatScratchInit();
10462 return true;
10463}
10464
10465SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10466 SDLoc DL(Op);
10467 LoadSDNode *Load = cast<LoadSDNode>(Op);
10468 ISD::LoadExtType ExtType = Load->getExtensionType();
10469 EVT MemVT = Load->getMemoryVT();
10470 MachineMemOperand *MMO = Load->getMemOperand();
10471
10472 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10473 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10474 return SDValue();
10475
10476 // FIXME: Copied from PPC
10477 // First, load into 32 bits, then truncate to 1 bit.
10478
10479 SDValue Chain = Load->getChain();
10480 SDValue BasePtr = Load->getBasePtr();
10481
10482 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10483
10484 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10485 RealMemVT, MMO);
10486
10487 if (!MemVT.isVector()) {
10488 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10489 NewLD.getValue(1)};
10490
10491 return DAG.getMergeValues(Ops, DL);
10492 }
10493
10494 SmallVector<SDValue, 3> Elts;
10495 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10496 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10497 DAG.getConstant(I, DL, MVT::i32));
10498
10499 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10500 }
10501
10502 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10503
10504 return DAG.getMergeValues(Ops, DL);
10505 }
10506
10507 if (!MemVT.isVector())
10508 return SDValue();
10509
10510 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10511 "Custom lowering for non-i32 vectors hasn't been implemented.");
10512
10513 Align Alignment = Load->getAlign();
10514 unsigned AS = Load->getAddressSpace();
10515 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10516 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10517 return SplitVectorLoad(Op, DAG);
10518 }
10519
10520 MachineFunction &MF = DAG.getMachineFunction();
10521 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10522 // If there is a possibility that flat instruction access scratch memory
10523 // then we need to use the same legalization rules we use for private.
10524 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10525 !Subtarget->hasMultiDwordFlatScratchAddressing())
10526 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10527 ? AMDGPUAS::PRIVATE_ADDRESS
10528 : AMDGPUAS::GLOBAL_ADDRESS;
10529
10530 unsigned NumElements = MemVT.getVectorNumElements();
10531
10532 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10533 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10534 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10535 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10536 isMemOpHasNoClobberedMemOperand(Load))) {
10537 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10538 Alignment >= Align(4) && NumElements < 32) {
10539 if (MemVT.isPow2VectorType() ||
10540 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10541 return SDValue();
10542 return WidenOrSplitVectorLoad(Op, DAG);
10543 }
10544 // Non-uniform loads will be selected to MUBUF instructions, so they
10545 // have the same legalization requirements as global and private
10546 // loads.
10547 //
10548 }
10549 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10550 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10551 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10552 if (NumElements > 4)
10553 return SplitVectorLoad(Op, DAG);
10554 // v3 loads not supported on SI.
10555 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10556 return WidenOrSplitVectorLoad(Op, DAG);
10557
10558 // v3 and v4 loads are supported for private and global memory.
10559 return SDValue();
10560 }
10561 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10562 // Depending on the setting of the private_element_size field in the
10563 // resource descriptor, we can only make private accesses up to a certain
10564 // size.
10565 switch (Subtarget->getMaxPrivateElementSize()) {
10566 case 4: {
10567 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10568 return DAG.getMergeValues({Op0, Op1}, DL);
10569 }
10570 case 8:
10571 if (NumElements > 2)
10572 return SplitVectorLoad(Op, DAG);
10573 return SDValue();
10574 case 16:
10575 // Same as global/flat
10576 if (NumElements > 4)
10577 return SplitVectorLoad(Op, DAG);
10578 // v3 loads not supported on SI.
10579 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10580 return WidenOrSplitVectorLoad(Op, DAG);
10581
10582 return SDValue();
10583 default:
10584 llvm_unreachable("unsupported private_element_size");
10585 }
10586 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10587 unsigned Fast = 0;
10588 auto Flags = Load->getMemOperand()->getFlags();
10589 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10590 Load->getAlign(), Flags, &Fast) &&
10591 Fast > 1)
10592 return SDValue();
10593
10594 if (MemVT.isVector())
10595 return SplitVectorLoad(Op, DAG);
10596 }
10597
10598 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10599 MemVT, *Load->getMemOperand())) {
10600 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10601 return DAG.getMergeValues({Op0, Op1}, DL);
10602 }
10603
10604 return SDValue();
10605}
10606
10607SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10608 EVT VT = Op.getValueType();
10609 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10610 VT.getSizeInBits() == 512)
10611 return splitTernaryVectorOp(Op, DAG);
10612
10613 assert(VT.getSizeInBits() == 64);
10614
10615 SDLoc DL(Op);
10616 SDValue Cond = Op.getOperand(0);
10617
10618 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10619 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10620
10621 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10622 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10623
10624 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10625 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10626
10627 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10628
10629 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10630 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10631
10632 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10633
10634 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10635 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10636}
10637
10638// Catch division cases where we can use shortcuts with rcp and rsq
10639// instructions.
10640SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10641 SelectionDAG &DAG) const {
10642 SDLoc SL(Op);
10643 SDValue LHS = Op.getOperand(0);
10644 SDValue RHS = Op.getOperand(1);
10645 EVT VT = Op.getValueType();
10646 const SDNodeFlags Flags = Op->getFlags();
10647
10648 bool AllowInaccurateRcp =
10649 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10650
10651 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10652 // Without !fpmath accuracy information, we can't do more because we don't
10653 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10654 // f16 is always accurate enough
10655 if (!AllowInaccurateRcp && VT != MVT::f16)
10656 return SDValue();
10657
10658 if (CLHS->isExactlyValue(1.0)) {
10659 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10660 // the CI documentation has a worst case error of 1 ulp.
10661 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10662 // use it as long as we aren't trying to use denormals.
10663 //
10664 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10665
10666 // 1.0 / sqrt(x) -> rsq(x)
10667
10668 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10669 // error seems really high at 2^29 ULP.
10670 // 1.0 / x -> rcp(x)
10671 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10672 }
10673
10674 // Same as for 1.0, but expand the sign out of the constant.
10675 if (CLHS->isExactlyValue(-1.0)) {
10676 // -1.0 / x -> rcp (fneg x)
10677 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10678 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10679 }
10680 }
10681
10682 // For f16 require afn or arcp.
10683 // For f32 require afn.
10684 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10685 return SDValue();
10686
10687 // Turn into multiply by the reciprocal.
10688 // x / y -> x * (1.0 / y)
10689 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10690 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10691}
10692
10693SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10694 SelectionDAG &DAG) const {
10695 SDLoc SL(Op);
10696 SDValue X = Op.getOperand(0);
10697 SDValue Y = Op.getOperand(1);
10698 EVT VT = Op.getValueType();
10699 const SDNodeFlags Flags = Op->getFlags();
10700
10701 bool AllowInaccurateDiv =
10702 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10703 if (!AllowInaccurateDiv)
10704 return SDValue();
10705
10706 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10707 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10708
10709 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10710 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10711
10712 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10713 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10714 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10715 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10716 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10717 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10718}
10719
10720static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10721 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10722 SDNodeFlags Flags) {
10723 if (GlueChain->getNumValues() <= 1) {
10724 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10725 }
10726
10727 assert(GlueChain->getNumValues() == 3);
10728
10729 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10730 switch (Opcode) {
10731 default:
10732 llvm_unreachable("no chain equivalent for opcode");
10733 case ISD::FMUL:
10734 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10735 break;
10736 }
10737
10738 return DAG.getNode(Opcode, SL, VTList,
10739 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10740 Flags);
10741}
10742
10743static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10744 EVT VT, SDValue A, SDValue B, SDValue C,
10745 SDValue GlueChain, SDNodeFlags Flags) {
10746 if (GlueChain->getNumValues() <= 1) {
10747 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10748 }
10749
10750 assert(GlueChain->getNumValues() == 3);
10751
10752 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10753 switch (Opcode) {
10754 default:
10755 llvm_unreachable("no chain equivalent for opcode");
10756 case ISD::FMA:
10757 Opcode = AMDGPUISD::FMA_W_CHAIN;
10758 break;
10759 }
10760
10761 return DAG.getNode(Opcode, SL, VTList,
10762 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10763 Flags);
10764}
10765
10766SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10767 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10768 return FastLowered;
10769
10770 SDLoc SL(Op);
10771 SDValue LHS = Op.getOperand(0);
10772 SDValue RHS = Op.getOperand(1);
10773
10774 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10775 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10776 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10777 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10778 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10779 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10780 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10781 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10782 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10783 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10784 // q16.u = opx(V_CVT_F16_F32, q32.u);
10785 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10786
10787 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10788 unsigned FMADOpCode =
10789 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : AMDGPUISD::FMAD_FTZ;
10790
10791 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10792 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10793 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10794 SDValue Rcp =
10795 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10796 SDValue Quot =
10797 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10798 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10799 Op->getFlags());
10800 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10801 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10802 Op->getFlags());
10803 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10804 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10805 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10806 DAG.getConstant(0xff800000, SL, MVT::i32));
10807 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10808 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10809 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10810 DAG.getTargetConstant(0, SL, MVT::i32));
10811 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10812 Op->getFlags());
10813}
10814
10815// Faster 2.5 ULP division that does not support denormals.
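// When |denominator| is above 2^+96 it is pre-scaled by 2^-32 so the rcp
// result stays well clear of the denormal range (rcp flushes denormals), and
// the final quotient is multiplied by the same 2^-32 factor to compensate.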
10816SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10817 SDNodeFlags Flags = Op->getFlags();
10818 SDLoc SL(Op);
10819 SDValue LHS = Op.getOperand(1);
10820 SDValue RHS = Op.getOperand(2);
10821
10822 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10823
10824 const APFloat K0Val(0x1p+96f);
10825 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10826
10827 const APFloat K1Val(0x1p-32f);
10828 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10829
10830 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10831
10832 EVT SetCCVT =
10833 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10834
10835 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10836
10837 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10838
10839 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10840
10841 // rcp does not support denormals.
10842 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10843
10844 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10845
10846 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10847}
10848
10849// Returns immediate value for setting the F32 denorm mode when using the
10850// S_DENORM_MODE instruction.
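// The immediate packs the requested FP32 (SP) denorm mode into bits [1:0]
// and the function's current FP64/FP16 (DP) denorm mode into bits [3:2].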
10851 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10852 const SIMachineFunctionInfo *Info,
10853 const GCNSubtarget *ST) {
10854 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10855 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10856 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10857 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10858}
10859
10860SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10861 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10862 return FastLowered;
10863
10864 // The selection matcher assumes anything with a chain selecting to a
10865 // mayRaiseFPException machine instruction. Since we're introducing a chain
10866 // here, we need to explicitly report nofpexcept for the regular fdiv
10867 // lowering.
10868 SDNodeFlags Flags = Op->getFlags();
10869 Flags.setNoFPExcept(true);
10870
10871 SDLoc SL(Op);
10872 SDValue LHS = Op.getOperand(0);
10873 SDValue RHS = Op.getOperand(1);
10874
10875 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10876
10877 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10878
10879 SDValue DenominatorScaled =
10880 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10881 SDValue NumeratorScaled =
10882 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10883
10884 // Denominator is scaled to not be denormal, so using rcp is ok.
10885 SDValue ApproxRcp =
10886 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10887 SDValue NegDivScale0 =
10888 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10889
10890 using namespace AMDGPU::Hwreg;
10891 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10892 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10893
10894 const MachineFunction &MF = DAG.getMachineFunction();
10895 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10896 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10897
10898 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10899 const bool HasDynamicDenormals =
10900 (DenormMode.Input == DenormalMode::Dynamic) ||
10901 (DenormMode.Output == DenormalMode::Dynamic);
10902
10903 SDValue SavedDenormMode;
10904
10905 if (!PreservesDenormals) {
10906 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10907 // lowering. The chain dependence is insufficient, and we need glue. We do
10908 // not need the glue variants in a strictfp function.
10909
10910 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10911
10912 SDValue Glue = DAG.getEntryNode();
10913 if (HasDynamicDenormals) {
10914 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10915 DAG.getVTList(MVT::i32, MVT::Glue),
10916 {BitField, Glue});
10917 SavedDenormMode = SDValue(GetReg, 0);
10918
10919 Glue = DAG.getMergeValues(
10920 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10921 }
10922
10923 SDNode *EnableDenorm;
10924 if (Subtarget->hasDenormModeInst()) {
10925 const SDValue EnableDenormValue =
10926 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10927
10928 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10929 EnableDenormValue)
10930 .getNode();
10931 } else {
10932 const SDValue EnableDenormValue =
10933 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10934 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10935 {EnableDenormValue, BitField, Glue});
10936 }
10937
10938 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10939 SDValue(EnableDenorm, 1)};
10940
10941 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10942 }
10943
10944 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10945 ApproxRcp, One, NegDivScale0, Flags);
10946
10947 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10948 ApproxRcp, Fma0, Flags);
10949
10950 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10951 Fma1, Flags);
10952
10953 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10954 NumeratorScaled, Mul, Flags);
10955
10956 SDValue Fma3 =
10957 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10958
10959 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10960 NumeratorScaled, Fma3, Flags);
10961
10962 if (!PreservesDenormals) {
10963 SDNode *DisableDenorm;
10964 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10965 const SDValue DisableDenormValue = getSPDenormModeValue(
10966 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10967
10968 DisableDenorm =
10969 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10970 DisableDenormValue, Fma4.getValue(2))
10971 .getNode();
10972 } else {
10973 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10974 const SDValue DisableDenormValue =
10975 HasDynamicDenormals
10976 ? SavedDenormMode
10977 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10978
10979 DisableDenorm = DAG.getMachineNode(
10980 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10981 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10982 }
10983
10984 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10985 SDValue(DisableDenorm, 0), DAG.getRoot());
10986 DAG.setRoot(OutputChain);
10987 }
10988
10989 SDValue Scale = NumeratorScaled.getValue(1);
10990 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10991 {Fma4, Fma1, Fma3, Scale}, Flags);
10992
10993 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10994}
10995
10996SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10997 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10998 return FastLowered;
10999
11000 SDLoc SL(Op);
11001 SDValue X = Op.getOperand(0);
11002 SDValue Y = Op.getOperand(1);
11003
11004 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11005
11006 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11007
11008 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11009
11010 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11011
11012 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11013
11014 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11015
11016 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11017
11018 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11019
11020 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11021
11022 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11023 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11024
11025 SDValue Fma4 =
11026 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11027
11028 SDValue Scale;
11029
11030 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11031 // Work around a hardware bug on SI where the condition output from div_scale
11032 // is not usable.
11033
11034 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11035
11036 // Figure out which scale to use for div_fmas.
11037 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11038 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11039 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11040 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11041
11042 SDValue NumHi =
11043 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11044 SDValue DenHi =
11045 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11046
11047 SDValue Scale0Hi =
11048 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11049 SDValue Scale1Hi =
11050 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11051
11052 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11053 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11054 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11055 } else {
11056 Scale = DivScale1.getValue(1);
11057 }
11058
11059 SDValue Fmas =
11060 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11061
11062 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11063}
11064
11065SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11066 EVT VT = Op.getValueType();
11067
11068 if (VT == MVT::f32)
11069 return LowerFDIV32(Op, DAG);
11070
11071 if (VT == MVT::f64)
11072 return LowerFDIV64(Op, DAG);
11073
11074 if (VT == MVT::f16)
11075 return LowerFDIV16(Op, DAG);
11076
11077 llvm_unreachable("Unexpected type for fdiv");
11078}
11079
11080SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11081 SDLoc dl(Op);
11082 SDValue Val = Op.getOperand(0);
11083 EVT VT = Val.getValueType();
11084 EVT ResultExpVT = Op->getValueType(1);
11085 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11086
11087 SDValue Mant = DAG.getNode(
11088 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11089 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11090
11091 SDValue Exp = DAG.getNode(
11092 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11093 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11094
11095 if (Subtarget->hasFractBug()) {
11096 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11097 SDValue Inf =
11098 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11099
11100 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11101 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11102 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11103 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11104 }
11105
11106 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11107 return DAG.getMergeValues({Mant, CastExp}, dl);
11108}
11109
11110SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11111 SDLoc DL(Op);
11112 StoreSDNode *Store = cast<StoreSDNode>(Op);
11113 EVT VT = Store->getMemoryVT();
11114
11115 if (VT == MVT::i1) {
11116 return DAG.getTruncStore(
11117 Store->getChain(), DL,
11118 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11119 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11120 }
11121
11122 assert(VT.isVector() &&
11123 Store->getValue().getValueType().getScalarType() == MVT::i32);
11124
11125 unsigned AS = Store->getAddressSpace();
11126 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11127 Store->getAlign().value() < VT.getStoreSize() &&
11128 VT.getSizeInBits() > 32) {
11129 return SplitVectorStore(Op, DAG);
11130 }
11131
11132 MachineFunction &MF = DAG.getMachineFunction();
11133 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11134 // If there is a possibility that flat instruction access scratch memory
11135 // then we need to use the same legalization rules we use for private.
11136 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11137 !Subtarget->hasMultiDwordFlatScratchAddressing())
11138 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11139 ? AMDGPUAS::PRIVATE_ADDRESS
11140 : AMDGPUAS::GLOBAL_ADDRESS;
11141
11142 unsigned NumElements = VT.getVectorNumElements();
11143 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11144 if (NumElements > 4)
11145 return SplitVectorStore(Op, DAG);
11146 // v3 stores not supported on SI.
11147 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11148 return SplitVectorStore(Op, DAG);
11149
11150 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11151 VT, *Store->getMemOperand()))
11152 return expandUnalignedStore(Store, DAG);
11153
11154 return SDValue();
11155 }
11156 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11157 switch (Subtarget->getMaxPrivateElementSize()) {
11158 case 4:
11159 return scalarizeVectorStore(Store, DAG);
11160 case 8:
11161 if (NumElements > 2)
11162 return SplitVectorStore(Op, DAG);
11163 return SDValue();
11164 case 16:
11165 if (NumElements > 4 ||
11166 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11167 return SplitVectorStore(Op, DAG);
11168 return SDValue();
11169 default:
11170 llvm_unreachable("unsupported private_element_size");
11171 }
11172 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11173 unsigned Fast = 0;
11174 auto Flags = Store->getMemOperand()->getFlags();
11175 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11176 Store->getAlign(), Flags, &Fast) &&
11177 Fast > 1)
11178 return SDValue();
11179
11180 if (VT.isVector())
11181 return SplitVectorStore(Op, DAG);
11182
11183 return expandUnalignedStore(Store, DAG);
11184 }
11185
11186 // Probably an invalid store. If so we'll end up emitting a selection error.
11187 return SDValue();
11188}
11189
11190// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11191SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11192 SDLoc SL(Op);
11193 assert(!Subtarget->has16BitInsts());
11194 SDNodeFlags Flags = Op->getFlags();
11195 SDValue Ext =
11196 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11197
11198 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11199 SDValue Sqrt =
11200 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11201
11202 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11203 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11204}
11205
11206SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11207 SDLoc DL(Op);
11208 SDNodeFlags Flags = Op->getFlags();
11209 MVT VT = Op.getValueType().getSimpleVT();
11210 const SDValue X = Op.getOperand(0);
11211
11212 if (allowApproxFunc(DAG, Flags)) {
11213 // Instruction is 1ulp but ignores denormals.
11214 return DAG.getNode(
11215 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11216 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11217 }
11218
11219 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11220 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11221
11222 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11223
11224 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11225
11226 SDValue SqrtX =
11227 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11228
11229 SDValue SqrtS;
11230 if (needsDenormHandlingF32(DAG, X, Flags)) {
11231 SDValue SqrtID =
11232 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11233 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11234
11235 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11236 SDValue SqrtSNextDownInt =
11237 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11238 DAG.getAllOnesConstant(DL, MVT::i32));
11239 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11240
11241 SDValue NegSqrtSNextDown =
11242 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11243
11244 SDValue SqrtVP =
11245 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11246
11247 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11248 DAG.getConstant(1, DL, MVT::i32));
11249 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11250
11251 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11252 SDValue SqrtVS =
11253 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11254
11255 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11256 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11257
11258 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11259 Flags);
11260
11261 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11262 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11263 Flags);
11264 } else {
11265 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11266
11267 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11268
11269 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11270 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11271 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11272
11273 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11274 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11275 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11276
11277 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11278 SDValue SqrtD =
11279 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11280 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11281 }
11282
11283 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11284
11285 SDValue ScaledDown =
11286 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11287
11288 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11289 SDValue IsZeroOrInf =
11290 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11291 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11292
11293 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11294}
11295
11296SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11297 // For double type, the SQRT and RSQ instructions don't have required
11298 // precision, we apply Goldschmidt's algorithm to improve the result:
11299 //
11300 // y0 = rsq(x)
11301 // g0 = x * y0
11302 // h0 = 0.5 * y0
11303 //
11304 // r0 = 0.5 - h0 * g0
11305 // g1 = g0 * r0 + g0
11306 // h1 = h0 * r0 + h0
11307 //
11308 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11309 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11310 // h2 = h1 * r1 + h1
11311 //
11312 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11313 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11314 //
11315 // sqrt(x) = g3
11316
11317 SDNodeFlags Flags = Op->getFlags();
11318
11319 SDLoc DL(Op);
11320
11321 SDValue X = Op.getOperand(0);
11322 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11323
11324 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11325
11326 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11327
11328 // Scale up input if it is too small.
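// sqrt(x * 2^256) == sqrt(x) * 2^128, so after operating on the scaled input
// the result is rescaled below with ldexp(result, -128).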
11329 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11330 SDValue ScaleUp =
11331 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11332 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11333
11334 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11335
11336 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11337
11338 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11339 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11340
11341 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11342 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11343
11344 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11345
11346 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11347
11348 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11349 SDValue SqrtD0 =
11350 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11351
11352 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11353
11354 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11355 SDValue SqrtD1 =
11356 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11357
11358 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11359
11360 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11361 SDValue ScaleDown =
11362 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11363 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11364
11365 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11366 // with finite only or nsz because rsq(+/-0) = +/-inf
11367
11368 // TODO: Check for DAZ and expand to subnormals
11369 SDValue IsZeroOrInf =
11370 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11371 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11372
11373 // If x is +INF, +0, or -0, use its original value
11374 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11375 Flags);
11376}
11377
11378SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11379 SDLoc DL(Op);
11380 EVT VT = Op.getValueType();
11381 SDValue Arg = Op.getOperand(0);
11382 SDValue TrigVal;
11383
11384 // Propagate fast-math flags so that the multiply we introduce can be folded
11385 // if Arg is already the result of a multiply by constant.
11386 auto Flags = Op->getFlags();
11387
11388 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11389
11390 if (Subtarget->hasTrigReducedRange()) {
11391 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11392 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11393 } else {
11394 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11395 }
11396
11397 switch (Op.getOpcode()) {
11398 case ISD::FCOS:
11399 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11400 case ISD::FSIN:
11401 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11402 default:
11403 llvm_unreachable("Wrong trig opcode");
11404 }
11405}
11406
11407SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11408 SelectionDAG &DAG) const {
11409 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11410 assert(AtomicNode->isCompareAndSwap());
11411 unsigned AS = AtomicNode->getAddressSpace();
11412
11413 // No custom lowering required for local address space
11414 if (AS == AMDGPUAS::LOCAL_ADDRESS)
11415 return Op;
11416
11417 // Non-local address space requires custom lowering for atomic compare
11418 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11419 SDLoc DL(Op);
11420 SDValue ChainIn = Op.getOperand(0);
11421 SDValue Addr = Op.getOperand(1);
11422 SDValue Old = Op.getOperand(2);
11423 SDValue New = Op.getOperand(3);
11424 EVT VT = Op.getValueType();
11425 MVT SimpleVT = VT.getSimpleVT();
11426 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11427
11428 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11429 SDValue Ops[] = {ChainIn, Addr, NewOld};
11430
11431 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11432 Op->getVTList(), Ops, VT,
11433 AtomicNode->getMemOperand());
11434}
11435
11436//===----------------------------------------------------------------------===//
11437// Custom DAG optimizations
11438//===----------------------------------------------------------------------===//
11439
11440SDValue
11441SITargetLowering::performUCharToFloatCombine(SDNode *N,
11442 DAGCombinerInfo &DCI) const {
11443 EVT VT = N->getValueType(0);
11444 EVT ScalarVT = VT.getScalarType();
11445 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11446 return SDValue();
11447
11448 SelectionDAG &DAG = DCI.DAG;
11449 SDLoc DL(N);
11450
11451 SDValue Src = N->getOperand(0);
11452 EVT SrcVT = Src.getValueType();
11453
11454 // TODO: We could try to match extracting the higher bytes, which would be
11455 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11456 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11457 // about in practice.
11458 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11459 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11460 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11461 DCI.AddToWorklist(Cvt.getNode());
11462
11463 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11464 if (ScalarVT != MVT::f32) {
11465 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11466 DAG.getTargetConstant(0, DL, MVT::i32));
11467 }
11468 return Cvt;
11469 }
11470 }
11471
11472 return SDValue();
11473}
11474
11475SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11476 DAGCombinerInfo &DCI) const {
11477 SDValue MagnitudeOp = N->getOperand(0);
11478 SDValue SignOp = N->getOperand(1);
11479 SelectionDAG &DAG = DCI.DAG;
11480 SDLoc DL(N);
11481
11482 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11483 // lower half with a copy.
11484 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11485 if (MagnitudeOp.getValueType() == MVT::f64) {
11486 SDValue MagAsVector =
11487 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11488 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11489 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11490 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11491 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11492
11493 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11494
11495 SDValue Vector =
11496 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11497
11498 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11499 }
11500
11501 if (SignOp.getValueType() != MVT::f64)
11502 return SDValue();
11503
11504 // Reduce width of sign operand, we only need the highest bit.
11505 //
11506 // fcopysign f64:x, f64:y ->
11507 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11508 // TODO: In some cases it might make sense to go all the way to f16.
11509 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11510 SDValue SignAsF32 =
11511 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11512 DAG.getConstant(1, DL, MVT::i32));
11513
11514 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11515 SignAsF32);
11516}
11517
11518// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11519// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11520// bits
11521
11522// This is a variant of
11523// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11524//
11525// The normal DAG combiner will do this, but only if the add has one use since
11526// that would increase the number of instructions.
11527//
11528// This prevents us from seeing a constant offset that can be folded into a
11529// memory instruction's addressing mode. If we know the resulting add offset of
11530// a pointer can be folded into an addressing offset, we can replace the pointer
11531// operand with the add of new constant offset. This eliminates one of the uses,
11532// and may allow the remaining use to also be simplified.
11533//
11534SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11535 EVT MemVT,
11536 DAGCombinerInfo &DCI) const {
11537 SDValue N0 = N->getOperand(0);
11538 SDValue N1 = N->getOperand(1);
11539
11540 // We only do this to handle cases where it's profitable when there are
11541 // multiple uses of the add, so defer to the standard combine.
11542 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11543 N0->hasOneUse())
11544 return SDValue();
11545
11546 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11547 if (!CN1)
11548 return SDValue();
11549
11550 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11551 if (!CAdd)
11552 return SDValue();
11553
11554 SelectionDAG &DAG = DCI.DAG;
11555
11556 if (N0->getOpcode() == ISD::OR &&
11557 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11558 return SDValue();
11559
11560 // If the resulting offset is too large, we can't fold it into the
11561 // addressing mode offset.
11562 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11563 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11564
11565 AddrMode AM;
11566 AM.HasBaseReg = true;
11567 AM.BaseOffs = Offset.getSExtValue();
11568 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11569 return SDValue();
11570
11571 SDLoc SL(N);
11572 EVT VT = N->getValueType(0);
11573
11574 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11575 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11576
11577 SDNodeFlags Flags;
11578 Flags.setNoUnsignedWrap(
11579 N->getFlags().hasNoUnsignedWrap() &&
11580 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11581
11582 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11583}
11584
11585 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
11586/// by the chain and intrinsic ID. Theoretically we would also need to check the
11587/// specific intrinsic, but they all place the pointer operand first.
11588static unsigned getBasePtrIndex(const MemSDNode *N) {
11589 switch (N->getOpcode()) {
11590 case ISD::STORE:
11591 case ISD::INTRINSIC_W_CHAIN:
11592 case ISD::INTRINSIC_VOID:
11593 return 2;
11594 default:
11595 return 1;
11596 }
11597}
11598
11599SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11600 DAGCombinerInfo &DCI) const {
11601 SelectionDAG &DAG = DCI.DAG;
11602 SDLoc SL(N);
11603
11604 unsigned PtrIdx = getBasePtrIndex(N);
11605 SDValue Ptr = N->getOperand(PtrIdx);
11606
11607 // TODO: We could also do this for multiplies.
11608 if (Ptr.getOpcode() == ISD::SHL) {
11609 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11610 N->getMemoryVT(), DCI);
11611 if (NewPtr) {
11612 SmallVector<SDValue, 8> NewOps(N->ops());
11613
11614 NewOps[PtrIdx] = NewPtr;
11615 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11616 }
11617 }
11618
11619 return SDValue();
11620}
11621
11622static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11623 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11624 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11625 (Opc == ISD::XOR && Val == 0);
11626}
11627
11628// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11629// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11630// integer combine opportunities since most 64-bit operations are decomposed
11631// this way. TODO: We won't want this for SALU especially if it is an inline
11632// immediate.
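// For example, (and i64 %x, 0xffffffff00000000) splits into an AND of the low
// half with 0 (folds to 0) and an AND of the high half with -1 (folds away),
// so no 64-bit immediate ever has to be materialized.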
11633SDValue SITargetLowering::splitBinaryBitConstantOp(
11634 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11635 const ConstantSDNode *CRHS) const {
11636 uint64_t Val = CRHS->getZExtValue();
11637 uint32_t ValLo = Lo_32(Val);
11638 uint32_t ValHi = Hi_32(Val);
11639 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11640
11641 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11642 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11643 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11644 // If we need to materialize a 64-bit immediate, it will be split up later
11645 // anyway. Avoid creating the harder to understand 64-bit immediate
11646 // materialization.
11647 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11648 }
11649
11650 return SDValue();
11651}
11652
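// Returns true if V is an i1 value produced by a compare-like node (setcc or
// fp_class) or by a bitwise combination of such values.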
11653 bool llvm::isBoolSGPR(SDValue V) {
11654 if (V.getValueType() != MVT::i1)
11655 return false;
11656 switch (V.getOpcode()) {
11657 default:
11658 break;
11659 case ISD::SETCC:
11660 case AMDGPUISD::FP_CLASS:
11661 return true;
11662 case ISD::AND:
11663 case ISD::OR:
11664 case ISD::XOR:
11665 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11666 }
11667 return false;
11668}
11669
11670// If a constant has all zeroes or all ones within each byte return it.
11671// Otherwise return 0.
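// For example, 0x00ff00ff and 0xffffffff are returned unchanged, while
// 0x00f0ff00 returns 0 because byte 2 is only partially selected.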
11672 static uint32_t getConstantPermuteMask(uint32_t C) {
11673 // 0xff for any zero byte in the mask
11674 uint32_t ZeroByteMask = 0;
11675 if (!(C & 0x000000ff))
11676 ZeroByteMask |= 0x000000ff;
11677 if (!(C & 0x0000ff00))
11678 ZeroByteMask |= 0x0000ff00;
11679 if (!(C & 0x00ff0000))
11680 ZeroByteMask |= 0x00ff0000;
11681 if (!(C & 0xff000000))
11682 ZeroByteMask |= 0xff000000;
11683 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11684 if ((NonZeroByteMask & C) != NonZeroByteMask)
11685 return 0; // Partial bytes selected.
11686 return C;
11687}
11688
11689// Check if a node selects whole bytes from its operand 0 starting at a byte
11690// boundary while masking the rest. Returns select mask as in the v_perm_b32
11691 // or -1 if it does not succeed.
11692// Note byte select encoding:
11693// value 0-3 selects corresponding source byte;
11694// value 0xc selects zero;
11695// value 0xff selects 0xff.
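// For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100 (bytes 1:0 of x,
// upper bytes zeroed) and (shl x, 16) yields 0x01000c0c (illustrative values).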
11696 static uint32_t getPermuteMask(SDValue V) {
11697 assert(V.getValueSizeInBits() == 32);
11698
11699 if (V.getNumOperands() != 2)
11700 return ~0;
11701
11702 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11703 if (!N1)
11704 return ~0;
11705
11706 uint32_t C = N1->getZExtValue();
11707
11708 switch (V.getOpcode()) {
11709 default:
11710 break;
11711 case ISD::AND:
11712 if (uint32_t ConstMask = getConstantPermuteMask(C))
11713 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11714 break;
11715
11716 case ISD::OR:
11717 if (uint32_t ConstMask = getConstantPermuteMask(C))
11718 return (0x03020100 & ~ConstMask) | ConstMask;
11719 break;
11720
11721 case ISD::SHL:
11722 if (C % 8)
11723 return ~0;
11724
11725 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11726
11727 case ISD::SRL:
11728 if (C % 8)
11729 return ~0;
11730
11731 return uint32_t(0x0c0c0c0c03020100ull >> C);
11732 }
11733
11734 return ~0;
11735}
11736
11737SDValue SITargetLowering::performAndCombine(SDNode *N,
11738 DAGCombinerInfo &DCI) const {
11739 if (DCI.isBeforeLegalize())
11740 return SDValue();
11741
11742 SelectionDAG &DAG = DCI.DAG;
11743 EVT VT = N->getValueType(0);
11744 SDValue LHS = N->getOperand(0);
11745 SDValue RHS = N->getOperand(1);
11746
11747 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11748 if (VT == MVT::i64 && CRHS) {
11749 if (SDValue Split =
11750 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11751 return Split;
11752 }
11753
11754 if (CRHS && VT == MVT::i32) {
11755 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11756 // nb = number of trailing zeroes in mask
11757 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11758 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
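// For example, (and (srl x, 8), 0xff00) has Bits = 8, Shift = 8 and NB = 8, so
// Offset = 16 and the node becomes (shl (bfe x, 16, 8), 8) (illustrative
// values).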
11759 uint64_t Mask = CRHS->getZExtValue();
11760 unsigned Bits = llvm::popcount(Mask);
11761 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11762 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11763 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11764 unsigned Shift = CShift->getZExtValue();
11765 unsigned NB = CRHS->getAPIntValue().countr_zero();
11766 unsigned Offset = NB + Shift;
11767 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11768 SDLoc SL(N);
11769 SDValue BFE =
11770 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11771 DAG.getConstant(Offset, SL, MVT::i32),
11772 DAG.getConstant(Bits, SL, MVT::i32));
11773 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11774 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11775 DAG.getValueType(NarrowVT));
11776 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11777 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11778 return Shl;
11779 }
11780 }
11781 }
11782
11783 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11784 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11785 isa<ConstantSDNode>(LHS.getOperand(2))) {
11786 uint32_t Sel = getConstantPermuteMask(Mask);
11787 if (!Sel)
11788 return SDValue();
11789
11790 // Select 0xc for all zero bytes
11791 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11792 SDLoc DL(N);
11793 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11794 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11795 }
11796 }
11797
11798 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11799 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11800 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11801 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11802 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11803
11804 SDValue X = LHS.getOperand(0);
11805 SDValue Y = RHS.getOperand(0);
11806 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11807 !isTypeLegal(X.getValueType()))
11808 return SDValue();
11809
11810 if (LCC == ISD::SETO) {
11811 if (X != LHS.getOperand(1))
11812 return SDValue();
11813
11814 if (RCC == ISD::SETUNE) {
11815 const ConstantFPSDNode *C1 =
11816 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11817 if (!C1 || !C1->isInfinity() || C1->isNegative())
11818 return SDValue();
11819
11820
11821 const uint32_t Mask = SIInstrFlags::N_NORMAL | SIInstrFlags::P_NORMAL |
11822 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL |
11823 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO;
11824
11825 static_assert(
11826 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11827 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11828 0x3ff) == Mask,
11829 "mask not equal");
11830
11831 SDLoc DL(N);
11832 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11833 DAG.getConstant(Mask, DL, MVT::i32));
11834 }
11835 }
11836 }
11837
11838 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11839 std::swap(LHS, RHS);
11840
11841 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11842 RHS.hasOneUse()) {
11843 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11844 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11845 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11846 // | n_nan)
11847 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11848 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11849 (RHS.getOperand(0) == LHS.getOperand(0) &&
11850 LHS.getOperand(0) == LHS.getOperand(1))) {
11851 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11852 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11853 : Mask->getZExtValue() & OrdMask;
11854
11855 SDLoc DL(N);
11856 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11857 DAG.getConstant(NewMask, DL, MVT::i32));
11858 }
11859 }
11860
11861 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11862 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11863 // and x, (sext cc from i1) => select cc, x, 0
11864 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11865 std::swap(LHS, RHS);
11866 if (isBoolSGPR(RHS.getOperand(0)))
11867 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11868 DAG.getConstant(0, SDLoc(N), MVT::i32));
11869 }
11870
11871 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11872 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11873 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11874 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11875 uint32_t LHSMask = getPermuteMask(LHS);
11876 uint32_t RHSMask = getPermuteMask(RHS);
11877 if (LHSMask != ~0u && RHSMask != ~0u) {
11878 // Canonicalize the expression in an attempt to have fewer unique masks
11879 // and therefore fewer registers used to hold the masks.
11880 if (LHSMask > RHSMask) {
11881 std::swap(LHSMask, RHSMask);
11882 std::swap(LHS, RHS);
11883 }
11884
11885 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11886 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11887 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11888 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11889
11890 // Check if we need to combine values from two sources within a byte.
11891 if (!(LHSUsedLanes & RHSUsedLanes) &&
11892 // If we select high and lower word keep it for SDWA.
11893 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11894 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11895 // Each byte in each mask is either selector mask 0-3, or has higher
11896 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11897 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11898 // mask which is not 0xff wins. By anding both masks we have a correct
11899 // result except that 0x0c shall be corrected to give 0x0c only.
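// For example, (and (or x, 0x00ff00ff), (or y, 0xff00ff00)) reaches here with
// LHSMask = 0x03ff01ff and RHSMask = 0xff02ff00; the combined mask below is
// 0x03020100 and the final selector is 0x07020500, i.e. bytes 3 and 1 come
// from x and bytes 2 and 0 from y (illustrative values).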
11900 uint32_t Mask = LHSMask & RHSMask;
11901 for (unsigned I = 0; I < 32; I += 8) {
11902 uint32_t ByteSel = 0xff << I;
11903 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11904 Mask &= (0x0c << I) & 0xffffffff;
11905 }
11906
11907 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11908 // or 0x0c.
11909 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11910 SDLoc DL(N);
11911
11912 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11913 RHS.getOperand(0),
11914 DAG.getConstant(Sel, DL, MVT::i32));
11915 }
11916 }
11917 }
11918
11919 return SDValue();
11920}
11921
11922// A key component of v_perm is a mapping between byte position of the src
11923// operands, and the byte position of the dest. To provide such, we need: 1. the
11924// node that provides x byte of the dest of the OR, and 2. the byte of the node
11925// used to provide that x byte. calculateByteProvider finds which node provides
11926// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11927 // and finds an ultimate src and byte position. For example, the supported
11928// LoadCombine pattern for vector loads is as follows
11929// t1
11930// or
11931// / \
11932// t2 t3
11933// zext shl
11934// | | \
11935// t4 t5 16
11936// or anyext
11937// / \ |
11938// t6 t7 t8
11939// srl shl or
11940// / | / \ / \
11941// t9 t10 t11 t12 t13 t14
11942// trunc* 8 trunc* 8 and and
11943// | | / | | \
11944// t15 t16 t17 t18 t19 t20
11945// trunc* 255 srl -256
11946// | / \
11947// t15 t15 16
11948//
11949// *In this example, the truncs are from i32->i16
11950//
11951// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11952// respectively. calculateSrcByte would find (given node) -> ultimate src &
11953 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11954// After finding the mapping, we can combine the tree into vperm t15, t16,
11955// 0x05000407
11956
11957// Find the source and byte position from a node.
11958// \p DestByte is the byte position of the dest of the or that the src
11959// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11960// dest of the or byte. \p Depth tracks how many recursive iterations we have
11961// performed.
11962static const std::optional<ByteProvider<SDValue>>
11963calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11964 unsigned Depth = 0) {
11965 // We may need to recursively traverse a series of SRLs
11966 if (Depth >= 6)
11967 return std::nullopt;
11968
11969 if (Op.getValueSizeInBits() < 8)
11970 return std::nullopt;
11971
11972 if (Op.getValueType().isVector())
11973 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11974
11975 switch (Op->getOpcode()) {
11976 case ISD::TRUNCATE: {
11977 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11978 }
11979
11980 case ISD::SIGN_EXTEND:
11981 case ISD::ZERO_EXTEND:
11982 case ISD::SIGN_EXTEND_INREG: {
11983 SDValue NarrowOp = Op->getOperand(0);
11984 auto NarrowVT = NarrowOp.getValueType();
11985 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11986 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11987 NarrowVT = VTSign->getVT();
11988 }
11989 if (!NarrowVT.isByteSized())
11990 return std::nullopt;
11991 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11992
11993 if (SrcIndex >= NarrowByteWidth)
11994 return std::nullopt;
11995 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11996 }
11997
11998 case ISD::SRA:
11999 case ISD::SRL: {
12000 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12001 if (!ShiftOp)
12002 return std::nullopt;
12003
12004 uint64_t BitShift = ShiftOp->getZExtValue();
12005
12006 if (BitShift % 8 != 0)
12007 return std::nullopt;
12008
12009 SrcIndex += BitShift / 8;
12010
12011 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12012 }
12013
12014 default: {
12015 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12016 }
12017 }
12018 llvm_unreachable("fully handled switch");
12019}
12020
12021// For a byte position in the result of an Or, traverse the tree and find the
12022// node (and the byte of the node) which ultimately provides this {Or,
12023// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12024// the byte position of the Op that corresponds with the originally requested
12025// byte of the Or \p Depth tracks how many recursive iterations we have
12026// performed. \p StartingIndex is the originally requested byte of the Or
12027static const std::optional<ByteProvider<SDValue>>
12028calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12029 unsigned StartingIndex = 0) {
12030 // Finding Src tree of RHS of or typically requires at least 1 additional
12031 // depth
12032 if (Depth > 6)
12033 return std::nullopt;
12034
12035 unsigned BitWidth = Op.getScalarValueSizeInBits();
12036 if (BitWidth % 8 != 0)
12037 return std::nullopt;
12038 if (Index > BitWidth / 8 - 1)
12039 return std::nullopt;
12040
12041 bool IsVec = Op.getValueType().isVector();
12042 switch (Op.getOpcode()) {
12043 case ISD::OR: {
12044 if (IsVec)
12045 return std::nullopt;
12046
12047 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12048 StartingIndex);
12049 if (!RHS)
12050 return std::nullopt;
12051 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12052 StartingIndex);
12053 if (!LHS)
12054 return std::nullopt;
12055 // A well formed Or will have two ByteProviders for each byte, one of which
12056 // is constant zero
12057 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12058 return std::nullopt;
12059 if (!LHS || LHS->isConstantZero())
12060 return RHS;
12061 if (!RHS || RHS->isConstantZero())
12062 return LHS;
12063 return std::nullopt;
12064 }
12065
12066 case ISD::AND: {
12067 if (IsVec)
12068 return std::nullopt;
12069
12070 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12071 if (!BitMaskOp)
12072 return std::nullopt;
12073
12074 uint32_t BitMask = BitMaskOp->getZExtValue();
12075 // Bits we expect for our StartingIndex
12076 uint32_t IndexMask = 0xFF << (Index * 8);
12077
12078 if ((IndexMask & BitMask) != IndexMask) {
12079 // If the result of the and partially provides the byte, then it
12080 // is not well formatted
12081 if (IndexMask & BitMask)
12082 return std::nullopt;
12083 return ByteProvider<SDValue>::getConstantZero();
12084 }
12085
12086 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12087 }
12088
12089 case ISD::FSHR: {
12090 if (IsVec)
12091 return std::nullopt;
12092
12093 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12094 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12095 if (!ShiftOp || Op.getValueType().isVector())
12096 return std::nullopt;
12097
12098 uint64_t BitsProvided = Op.getValueSizeInBits();
12099 if (BitsProvided % 8 != 0)
12100 return std::nullopt;
12101
12102 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12103 if (BitShift % 8)
12104 return std::nullopt;
12105
12106 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12107 uint64_t ByteShift = BitShift / 8;
12108
12109 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12110 uint64_t BytesProvided = BitsProvided / 8;
12111 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12112 NewIndex %= BytesProvided;
12113 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12114 }
12115
12116 case ISD::SRA:
12117 case ISD::SRL: {
12118 if (IsVec)
12119 return std::nullopt;
12120
12121 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12122 if (!ShiftOp)
12123 return std::nullopt;
12124
12125 uint64_t BitShift = ShiftOp->getZExtValue();
12126 if (BitShift % 8)
12127 return std::nullopt;
12128
12129 auto BitsProvided = Op.getScalarValueSizeInBits();
12130 if (BitsProvided % 8 != 0)
12131 return std::nullopt;
12132
12133 uint64_t BytesProvided = BitsProvided / 8;
12134 uint64_t ByteShift = BitShift / 8;
12135 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12136 // If the byte we are trying to provide (as tracked by index) falls in this
12137 // range, then the SRL provides the byte. The byte of interest of the src of
12138 // the SRL is Index + ByteShift
12139 return BytesProvided - ByteShift > Index
12140 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12141 Index + ByteShift)
12142 : ByteProvider<SDValue>::getConstantZero();
12143 }
12144
12145 case ISD::SHL: {
12146 if (IsVec)
12147 return std::nullopt;
12148
12149 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12150 if (!ShiftOp)
12151 return std::nullopt;
12152
12153 uint64_t BitShift = ShiftOp->getZExtValue();
12154 if (BitShift % 8 != 0)
12155 return std::nullopt;
12156 uint64_t ByteShift = BitShift / 8;
12157
12158 // If we are shifting by an amount greater than (or equal to)
12159 // the index we are trying to provide, then it provides 0s. If not,
12160 // then these bytes are not definitively 0s, and the corresponding byte
12161 // of interest is Index - ByteShift of the src
12162 return Index < ByteShift
12163 ? ByteProvider<SDValue>::getConstantZero()
12164 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12165 Depth + 1, StartingIndex);
12166 }
12167 case ISD::ANY_EXTEND:
12168 case ISD::SIGN_EXTEND:
12169 case ISD::ZERO_EXTEND:
12170 case ISD::SIGN_EXTEND_INREG:
12171 case ISD::AssertZext:
12172 case ISD::AssertSext: {
12173 if (IsVec)
12174 return std::nullopt;
12175
12176 SDValue NarrowOp = Op->getOperand(0);
12177 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12178 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12179 Op->getOpcode() == ISD::AssertZext ||
12180 Op->getOpcode() == ISD::AssertSext) {
12181 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12182 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12183 }
12184 if (NarrowBitWidth % 8 != 0)
12185 return std::nullopt;
12186 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12187
12188 if (Index >= NarrowByteWidth)
12189 return Op.getOpcode() == ISD::ZERO_EXTEND
12190 ? std::optional<ByteProvider<SDValue>>(
12191 ByteProvider<SDValue>::getConstantZero())
12192 : std::nullopt;
12193 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12194 }
12195
12196 case ISD::TRUNCATE: {
12197 if (IsVec)
12198 return std::nullopt;
12199
12200 uint64_t NarrowByteWidth = BitWidth / 8;
12201
12202 if (NarrowByteWidth >= Index) {
12203 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12204 StartingIndex);
12205 }
12206
12207 return std::nullopt;
12208 }
12209
12210 case ISD::CopyFromReg: {
12211 if (BitWidth / 8 > Index)
12212 return calculateSrcByte(Op, StartingIndex, Index);
12213
12214 return std::nullopt;
12215 }
12216
12217 case ISD::LOAD: {
12218 auto *L = cast<LoadSDNode>(Op.getNode());
12219
12220 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12221 if (NarrowBitWidth % 8 != 0)
12222 return std::nullopt;
12223 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12224
12225 // If the width of the load does not reach the byte we are trying to provide for
12226 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12227 // question
12228 if (Index >= NarrowByteWidth) {
12229 return L->getExtensionType() == ISD::ZEXTLOAD
12230 ? std::optional<ByteProvider<SDValue>>(
12231 ByteProvider<SDValue>::getConstantZero())
12232 : std::nullopt;
12233 }
12234
12235 if (NarrowByteWidth > Index) {
12236 return calculateSrcByte(Op, StartingIndex, Index);
12237 }
12238
12239 return std::nullopt;
12240 }
12241
12242 case ISD::BSWAP: {
12243 if (IsVec)
12244 return std::nullopt;
12245
12246 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12247 Depth + 1, StartingIndex);
12248 }
12249
12250 case ISD::EXTRACT_VECTOR_ELT: {
12251 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12252 if (!IdxOp)
12253 return std::nullopt;
12254 auto VecIdx = IdxOp->getZExtValue();
12255 auto ScalarSize = Op.getScalarValueSizeInBits();
12256 if (ScalarSize < 32)
12257 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12258 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12259 StartingIndex, Index);
12260 }
12261
12262 case AMDGPUISD::PERM: {
12263 if (IsVec)
12264 return std::nullopt;
12265
12266 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12267 if (!PermMask)
12268 return std::nullopt;
12269
12270 auto IdxMask =
12271 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12272 if (IdxMask > 0x07 && IdxMask != 0x0c)
12273 return std::nullopt;
12274
12275 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12276 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12277
12278 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12279 : ByteProvider<SDValue>(
12280 ByteProvider<SDValue>::getConstantZero());
12281 }
12282
12283 default: {
12284 return std::nullopt;
12285 }
12286 }
12287
12288 llvm_unreachable("fully handled switch");
12289}
12290
12291// Returns true if the Operand is a scalar and is 16 bits
12292static bool isExtendedFrom16Bits(SDValue &Operand) {
12293
12294 switch (Operand.getOpcode()) {
12295 case ISD::ANY_EXTEND:
12296 case ISD::SIGN_EXTEND:
12297 case ISD::ZERO_EXTEND: {
12298 auto OpVT = Operand.getOperand(0).getValueType();
12299 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12300 }
12301 case ISD::LOAD: {
12302 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12303 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12304 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12305 ExtType == ISD::EXTLOAD) {
12306 auto MemVT = L->getMemoryVT();
12307 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12308 }
12309 return L->getMemoryVT().getSizeInBits() == 16;
12310 }
12311 default:
12312 return false;
12313 }
12314}
12315
12316 // Returns true if the mask matches consecutive bytes and the first byte
12317 // begins at an even (16-bit aligned) offset from byte 0.
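// For example, mask 0x0100 (bytes 1:0) qualifies, while 0x0201 (bytes 2:1)
// does not because it starts at byte 1.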
12318static bool addresses16Bits(int Mask) {
12319 int Low8 = Mask & 0xff;
12320 int Hi8 = (Mask & 0xff00) >> 8;
12321
12322 assert(Low8 < 8 && Hi8 < 8);
12323 // Are the bytes contiguous in the order of increasing addresses.
12324 bool IsConsecutive = (Hi8 - Low8 == 1);
12325 // Is the first byte at location that is aligned for 16 bit instructions.
12326 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12327 // In this case, we still need code to extract the 16 bit operand, so it
12328 // is better to use i8 v_perm
12329 bool Is16Aligned = !(Low8 % 2);
12330
12331 return IsConsecutive && Is16Aligned;
12332}
12333
12334// Do not lower into v_perm if the operands are actually 16 bit
12335// and the selected bits (based on PermMask) correspond with two
12336// easily addressable 16 bit operands.
12337 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12338 SDValue &OtherOp) {
12339 int Low16 = PermMask & 0xffff;
12340 int Hi16 = (PermMask & 0xffff0000) >> 16;
12341
12342 auto TempOp = peekThroughBitcasts(Op);
12343 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12344
12345 auto OpIs16Bit =
12346 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12347 if (!OpIs16Bit)
12348 return true;
12349
12350 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12351 isExtendedFrom16Bits(TempOtherOp);
12352 if (!OtherOpIs16Bit)
12353 return true;
12354
12355 // Do we cleanly address both
12356 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12357}
12358
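// Extract the 32-bit dword at dword index DWordOffset from Src and return it
// as an i32, e.g. DWordOffset = 1 on a v4i16 source returns elements 2 and 3
// repacked as a single i32.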
12359 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12360 unsigned DWordOffset) {
12361 SDValue Ret;
12362
12363 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12364 // ByteProvider must be at least 8 bits
12365 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12366
12367 if (TypeSize <= 32)
12368 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12369
12370 if (Src.getValueType().isVector()) {
12371 auto ScalarTySize = Src.getScalarValueSizeInBits();
12372 auto ScalarTy = Src.getValueType().getScalarType();
12373 if (ScalarTySize == 32) {
12374 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12375 DAG.getConstant(DWordOffset, SL, MVT::i32));
12376 }
12377 if (ScalarTySize > 32) {
12378 Ret = DAG.getNode(
12379 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12380 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12381 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12382 if (ShiftVal)
12383 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12384 DAG.getConstant(ShiftVal, SL, MVT::i32));
12385 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12386 }
12387
12388 assert(ScalarTySize < 32);
12389 auto NumElements = TypeSize / ScalarTySize;
12390 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12391 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12392 auto NumElementsIn32 = 32 / ScalarTySize;
12393 auto NumAvailElements = DWordOffset < Trunc32Elements
12394 ? NumElementsIn32
12395 : NumElements - NormalizedTrunc;
12396
12398 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12399 NumAvailElements);
12400
12401 Ret = DAG.getBuildVector(
12402 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12403 VecSrcs);
12404 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12405 }
12406
12407 /// Scalar Type
12408 auto ShiftVal = 32 * DWordOffset;
12409 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12410 DAG.getConstant(ShiftVal, SL, MVT::i32));
12411 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12412}
12413
12414 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12415 SelectionDAG &DAG = DCI.DAG;
12416 [[maybe_unused]] EVT VT = N->getValueType(0);
12417 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12418
12419 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12420 assert(VT == MVT::i32);
12421 for (int i = 0; i < 4; i++) {
12422 // Find the ByteProvider that provides the ith byte of the result of OR
12423 std::optional<ByteProvider<SDValue>> P =
12424 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12425 // TODO support constantZero
12426 if (!P || P->isConstantZero())
12427 return SDValue();
12428
12429 PermNodes.push_back(*P);
12430 }
12431 if (PermNodes.size() != 4)
12432 return SDValue();
12433
12434 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12435 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12436 uint64_t PermMask = 0x00000000;
12437 for (size_t i = 0; i < PermNodes.size(); i++) {
12438 auto PermOp = PermNodes[i];
12439 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12440 // by sizeof(Src2) = 4
12441 int SrcByteAdjust = 4;
12442
12443 // If the Src uses a byte from a different DWORD, then it corresponds
12444 // with a different source
12445 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12446 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12447 if (SecondSrc)
12448 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12449 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12450 return SDValue();
12451
12452 // Set the index of the second distinct Src node
12453 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12454 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12455 SrcByteAdjust = 0;
12456 }
12457 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12458 assert(!DAG.getDataLayout().isBigEndian());
12459 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12460 }
12461 SDLoc DL(N);
12462 SDValue Op = *PermNodes[FirstSrc.first].Src;
12463 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12464 assert(Op.getValueSizeInBits() == 32);
12465
12466 // Check that we are not just extracting the bytes in order from an op
12467 if (!SecondSrc) {
12468 int Low16 = PermMask & 0xffff;
12469 int Hi16 = (PermMask & 0xffff0000) >> 16;
12470
12471 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12472 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12473
12474 // The perm op would really just produce Op. So combine into Op
12475 if (WellFormedLow && WellFormedHi)
12476 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12477 }
12478
12479 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12480
12481 if (SecondSrc) {
12482 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12483 assert(OtherOp.getValueSizeInBits() == 32);
12484 }
12485
12486 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12487
12488 assert(Op.getValueType().isByteSized() &&
12489 OtherOp.getValueType().isByteSized());
12490
12491 // If the ultimate src is less than 32 bits, then we will only be
12492 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12493 // CalculateByteProvider would not have returned Op as source if we
12494 // used a byte that is outside its ValueType. Thus, we are free to
12495 // ANY_EXTEND as the extended bits are dont-cares.
12496 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12497 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12498
12499 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12500 DAG.getConstant(PermMask, DL, MVT::i32));
12501 }
12502 return SDValue();
12503}
12504
12505SDValue SITargetLowering::performOrCombine(SDNode *N,
12506 DAGCombinerInfo &DCI) const {
12507 SelectionDAG &DAG = DCI.DAG;
12508 SDValue LHS = N->getOperand(0);
12509 SDValue RHS = N->getOperand(1);
12510
12511 EVT VT = N->getValueType(0);
12512 if (VT == MVT::i1) {
12513 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12514 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12515 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12516 SDValue Src = LHS.getOperand(0);
12517 if (Src != RHS.getOperand(0))
12518 return SDValue();
12519
12520 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12521 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12522 if (!CLHS || !CRHS)
12523 return SDValue();
12524
12525 // Only 10 bits are used.
12526 static const uint32_t MaxMask = 0x3ff;
12527
12528 uint32_t NewMask =
12529 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12530 SDLoc DL(N);
12531 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12532 DAG.getConstant(NewMask, DL, MVT::i32));
12533 }
12534
12535 return SDValue();
12536 }
12537
12538 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12539 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12540 LHS.getOpcode() == AMDGPUISD::PERM &&
12541 isa<ConstantSDNode>(LHS.getOperand(2))) {
12542 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12543 if (!Sel)
12544 return SDValue();
12545
12546 Sel |= LHS.getConstantOperandVal(2);
12547 SDLoc DL(N);
12548 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12549 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12550 }
12551
12552 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12553 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12554 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12555 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12556
12557 // If all the uses of an or need to extract the individual elements, do not
12558 // attempt to lower into v_perm
12559 auto usesCombinedOperand = [](SDNode *OrUse) {
12560 // If we have any non-vectorized use, then it is a candidate for v_perm
12561 if (OrUse->getOpcode() != ISD::BITCAST ||
12562 !OrUse->getValueType(0).isVector())
12563 return true;
12564
12565 // If we have any non-vectorized use, then it is a candidate for v_perm
12566 for (auto *VUser : OrUse->users()) {
12567 if (!VUser->getValueType(0).isVector())
12568 return true;
12569
12570 // If the use of a vector is a store, then combining via a v_perm
12571 // is beneficial.
12572 // TODO -- whitelist more uses
12573 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12574 if (VUser->getOpcode() == VectorwiseOp)
12575 return true;
12576 }
12577 return false;
12578 };
12579
12580 if (!any_of(N->users(), usesCombinedOperand))
12581 return SDValue();
12582
12583 uint32_t LHSMask = getPermuteMask(LHS);
12584 uint32_t RHSMask = getPermuteMask(RHS);
12585
12586 if (LHSMask != ~0u && RHSMask != ~0u) {
12587 // Canonicalize the expression in an attempt to have fewer unique masks
12588 // and therefore fewer registers used to hold the masks.
12589 if (LHSMask > RHSMask) {
12590 std::swap(LHSMask, RHSMask);
12591 std::swap(LHS, RHS);
12592 }
12593
12594 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12595 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12596 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12597 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12598
12599 // Check if we need to combine values from two sources within a byte.
12600 if (!(LHSUsedLanes & RHSUsedLanes) &&
12601 // If we select high and lower word keep it for SDWA.
12602 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12603 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12604 // Kill zero bytes selected by other mask. Zero value is 0xc.
12605 LHSMask &= ~RHSUsedLanes;
12606 RHSMask &= ~LHSUsedLanes;
12607 // Add 4 to each active LHS lane
12608 LHSMask |= LHSUsedLanes & 0x04040404;
12609 // Combine masks
12610 uint32_t Sel = LHSMask | RHSMask;
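// For example, or-ing (and x, 0x00ff00ff) with (and y, 0xff00ff00) reaches
// here (after the swap above) with LHSMask = 0x030c010c and
// RHSMask = 0x0c020c00, which reduce to 0x07000500 and 0x00020000, giving
// Sel = 0x07020500 (illustrative values).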
12611 SDLoc DL(N);
12612
12613 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12614 RHS.getOperand(0),
12615 DAG.getConstant(Sel, DL, MVT::i32));
12616 }
12617 }
12618 if (LHSMask == ~0u || RHSMask == ~0u) {
12619 if (SDValue Perm = matchPERM(N, DCI))
12620 return Perm;
12621 }
12622 }
12623
12624 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12625 return SDValue();
12626
12627 // TODO: This could be a generic combine with a predicate for extracting the
12628 // high half of an integer being free.
12629
12630 // (or i64:x, (zero_extend i32:y)) ->
12631 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12632 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12633 RHS.getOpcode() != ISD::ZERO_EXTEND)
12634 std::swap(LHS, RHS);
12635
12636 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12637 SDValue ExtSrc = RHS.getOperand(0);
12638 EVT SrcVT = ExtSrc.getValueType();
12639 if (SrcVT == MVT::i32) {
12640 SDLoc SL(N);
12641 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12642 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12643
12644 DCI.AddToWorklist(LowOr.getNode());
12645 DCI.AddToWorklist(HiBits.getNode());
12646
12647 SDValue Vec =
12648 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12649 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12650 }
12651 }
12652
12653 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12654 if (CRHS) {
12655 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12656 N->getOperand(0), CRHS))
12657 return Split;
12658 }
12659
12660 return SDValue();
12661}
12662
12663SDValue SITargetLowering::performXorCombine(SDNode *N,
12664 DAGCombinerInfo &DCI) const {
12665 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12666 return RV;
12667
12668 SDValue LHS = N->getOperand(0);
12669 SDValue RHS = N->getOperand(1);
12670
12671 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12672 SelectionDAG &DAG = DCI.DAG;
12673
12674 EVT VT = N->getValueType(0);
12675 if (CRHS && VT == MVT::i64) {
12676 if (SDValue Split =
12677 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12678 return Split;
12679 }
12680
12681 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12682 // fneg-like xors into 64-bit select.
12683 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12684 // This looks like an fneg, try to fold as a source modifier.
12685 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12686 shouldFoldFNegIntoSrc(N, LHS)) {
12687 // xor (select c, a, b), 0x80000000 ->
12688 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12689 SDLoc DL(N);
12690 SDValue CastLHS =
12691 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12692 SDValue CastRHS =
12693 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12694 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12695 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12696 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12697 LHS->getOperand(0), FNegLHS, FNegRHS);
12698 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12699 }
12700 }
12701
12702 return SDValue();
12703}
12704
12705SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12706 DAGCombinerInfo &DCI) const {
12707 if (!Subtarget->has16BitInsts() ||
12708 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12709 return SDValue();
12710
12711 EVT VT = N->getValueType(0);
12712 if (VT != MVT::i32)
12713 return SDValue();
12714
12715 SDValue Src = N->getOperand(0);
12716 if (Src.getValueType() != MVT::i16)
12717 return SDValue();
12718
12719 return SDValue();
12720}
12721
12722SDValue
12723SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12724 DAGCombinerInfo &DCI) const {
12725 SDValue Src = N->getOperand(0);
12726 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12727
12728 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12729 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12730 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12731 VTSign->getVT() == MVT::i8) ||
12732 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12733 VTSign->getVT() == MVT::i16))) {
12734 assert(Subtarget->hasScalarSubwordLoads() &&
12735 "s_buffer_load_{u8, i8} are supported "
12736 "in GFX12 (or newer) architectures.");
12737 EVT VT = Src.getValueType();
12738 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12739 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12740 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12741 SDLoc DL(N);
12742 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12743 SDValue Ops[] = {
12744 Src.getOperand(0), // source register
12745 Src.getOperand(1), // offset
12746 Src.getOperand(2) // cachePolicy
12747 };
12748 auto *M = cast<MemSDNode>(Src);
12749 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12750 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12751 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12752 return LoadVal;
12753 }
12754 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12755 VTSign->getVT() == MVT::i8) ||
12756 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12757 VTSign->getVT() == MVT::i16)) &&
12758 Src.hasOneUse()) {
12759 auto *M = cast<MemSDNode>(Src);
12760 SDValue Ops[] = {Src.getOperand(0), // Chain
12761 Src.getOperand(1), // rsrc
12762 Src.getOperand(2), // vindex
12763 Src.getOperand(3), // voffset
12764 Src.getOperand(4), // soffset
12765 Src.getOperand(5), // offset
12766 Src.getOperand(6), Src.getOperand(7)};
12767 // replace with BUFFER_LOAD_BYTE/SHORT
12768 SDVTList ResList =
12769 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12770 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12771 ? AMDGPUISD::BUFFER_LOAD_BYTE
12772 : AMDGPUISD::BUFFER_LOAD_SHORT;
12773 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12774 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12775 return DCI.DAG.getMergeValues(
12776 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12777 }
12778 return SDValue();
12779}
12780
12781SDValue SITargetLowering::performClassCombine(SDNode *N,
12782 DAGCombinerInfo &DCI) const {
12783 SelectionDAG &DAG = DCI.DAG;
12784 SDValue Mask = N->getOperand(1);
12785
12786 // fp_class x, 0 -> false
12787 if (isNullConstant(Mask))
12788 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12789
12790 if (N->getOperand(0).isUndef())
12791 return DAG.getUNDEF(MVT::i1);
12792
12793 return SDValue();
12794}
12795
12796SDValue SITargetLowering::performRcpCombine(SDNode *N,
12797 DAGCombinerInfo &DCI) const {
12798 EVT VT = N->getValueType(0);
12799 SDValue N0 = N->getOperand(0);
12800
12801 if (N0.isUndef()) {
12802 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12803 SDLoc(N), VT);
12804 }
12805
12806 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12807 N0.getOpcode() == ISD::SINT_TO_FP)) {
12808 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12809 N->getFlags());
12810 }
12811
12812 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12813 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12814 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12815 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12816 N->getFlags());
12817 }
12818
12820 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12821
12822 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12823 unsigned MaxDepth) const {
12824 unsigned Opcode = Op.getOpcode();
12825 if (Opcode == ISD::FCANONICALIZE)
12826 return true;
12827
12828 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12829 const auto &F = CFP->getValueAPF();
12830 if (F.isNaN() && F.isSignaling())
12831 return false;
12832 if (!F.isDenormal())
12833 return true;
12834
12835 DenormalMode Mode =
12836 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12837 return Mode == DenormalMode::getIEEE();
12838 }
12839
12840 // If source is a result of another standard FP operation it is already in
12841 // canonical form.
12842 if (MaxDepth == 0)
12843 return false;
12844
12845 switch (Opcode) {
12846 // These will flush denorms if required.
12847 case ISD::FADD:
12848 case ISD::FSUB:
12849 case ISD::FMUL:
12850 case ISD::FCEIL:
12851 case ISD::FFLOOR:
12852 case ISD::FMA:
12853 case ISD::FMAD:
12854 case ISD::FSQRT:
12855 case ISD::FDIV:
12856 case ISD::FREM:
12857 case ISD::FP_ROUND:
12858 case ISD::FP_EXTEND:
12859 case ISD::FP16_TO_FP:
12860 case ISD::FP_TO_FP16:
12861 case ISD::BF16_TO_FP:
12862 case ISD::FP_TO_BF16:
12863 case ISD::FLDEXP:
12866 case AMDGPUISD::RCP:
12867 case AMDGPUISD::RSQ:
12871 case AMDGPUISD::LOG:
12872 case AMDGPUISD::EXP:
12876 case AMDGPUISD::FRACT:
12883 case AMDGPUISD::SIN_HW:
12884 case AMDGPUISD::COS_HW:
12885 return true;
12886
12887 // It can/will be lowered or combined as a bit operation.
12888 // Need to check their input recursively to handle.
12889 case ISD::FNEG:
12890 case ISD::FABS:
12891 case ISD::FCOPYSIGN:
12892 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12893
12894 case ISD::AND:
12895 if (Op.getValueType() == MVT::i32) {
12896 // Be careful as we only know it is a bitcast floating point type. It
12897 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12898 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12899 // is valid to optimize for all types.
12900 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12901 if (RHS->getZExtValue() == 0xffff0000) {
12902 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12903 }
12904 }
12905 }
12906 break;
12907
12908 case ISD::FSIN:
12909 case ISD::FCOS:
12910 case ISD::FSINCOS:
12911 return Op.getValueType().getScalarType() != MVT::f16;
12912
12913 case ISD::FMINNUM:
12914 case ISD::FMAXNUM:
12915 case ISD::FMINNUM_IEEE:
12916 case ISD::FMAXNUM_IEEE:
12917 case ISD::FMINIMUM:
12918 case ISD::FMAXIMUM:
12919 case AMDGPUISD::CLAMP:
12920 case AMDGPUISD::FMED3:
12921 case AMDGPUISD::FMAX3:
12922 case AMDGPUISD::FMIN3:
12923 case AMDGPUISD::FMAXIMUM3:
12924 case AMDGPUISD::FMINIMUM3: {
12925 // FIXME: Shouldn't treat the generic operations differently based on these.
12926 // However, we aren't really required to flush the result from
12927 // minnum/maxnum..
12928
12929 // snans will be quieted, so we only need to worry about denormals.
12930 if (Subtarget->supportsMinMaxDenormModes() ||
12931 // FIXME: denormalsEnabledForType is broken for dynamic
12932 denormalsEnabledForType(DAG, Op.getValueType()))
12933 return true;
12934
12935 // Flushing may be required.
12936 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12937 // targets need to check their input recursively.
12938
12939 // FIXME: Does this apply with clamp? It's implemented with max.
12940 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12941 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12942 return false;
12943 }
12944
12945 return true;
12946 }
12947 case ISD::SELECT: {
12948 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12949 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12950 }
12951 case ISD::BUILD_VECTOR: {
12952 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12953 SDValue SrcOp = Op.getOperand(i);
12954 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12955 return false;
12956 }
12957
12958 return true;
12959 }
12960 case ISD::EXTRACT_VECTOR_ELT:
12961 case ISD::EXTRACT_SUBVECTOR: {
12962 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12963 }
12964 case ISD::INSERT_VECTOR_ELT: {
12965 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12966 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12967 }
12968 case ISD::UNDEF:
12969 // Could be anything.
12970 return false;
12971
12972 case ISD::BITCAST:
12973 // TODO: This is incorrect as it loses track of the operand's type. We may
12974 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12975 // same bits that are canonicalized in one type need not be in the other.
12976 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12977 case ISD::TRUNCATE: {
12978 // Hack round the mess we make when legalizing extract_vector_elt
12979 if (Op.getValueType() == MVT::i16) {
12980 SDValue TruncSrc = Op.getOperand(0);
12981 if (TruncSrc.getValueType() == MVT::i32 &&
12982 TruncSrc.getOpcode() == ISD::BITCAST &&
12983 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12984 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12985 }
12986 }
12987 return false;
12988 }
12989 case ISD::INTRINSIC_WO_CHAIN: {
12990 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12991 // TODO: Handle more intrinsics
12992 switch (IntrinsicID) {
12993 case Intrinsic::amdgcn_cvt_pkrtz:
12994 case Intrinsic::amdgcn_cubeid:
12995 case Intrinsic::amdgcn_frexp_mant:
12996 case Intrinsic::amdgcn_fdot2:
12997 case Intrinsic::amdgcn_rcp:
12998 case Intrinsic::amdgcn_rsq:
12999 case Intrinsic::amdgcn_rsq_clamp:
13000 case Intrinsic::amdgcn_rcp_legacy:
13001 case Intrinsic::amdgcn_rsq_legacy:
13002 case Intrinsic::amdgcn_trig_preop:
13003 case Intrinsic::amdgcn_log:
13004 case Intrinsic::amdgcn_exp2:
13005 case Intrinsic::amdgcn_sqrt:
13006 return true;
13007 default:
13008 break;
13009 }
13010
13011 break;
13012 }
13013 default:
13014 break;
13015 }
13016
13017 // FIXME: denormalsEnabledForType is broken for dynamic
13018 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13019 DAG.isKnownNeverSNaN(Op);
13020}
13021
13022 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13023 unsigned MaxDepth) const {
13024 const MachineRegisterInfo &MRI = MF.getRegInfo();
13025 MachineInstr *MI = MRI.getVRegDef(Reg);
13026 unsigned Opcode = MI->getOpcode();
13027
13028 if (Opcode == AMDGPU::G_FCANONICALIZE)
13029 return true;
13030
13031 std::optional<FPValueAndVReg> FCR;
13032 // Constant splat (can be padded with undef) or scalar constant.
13033 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13034 if (FCR->Value.isSignaling())
13035 return false;
13036 if (!FCR->Value.isDenormal())
13037 return true;
13038
13039 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13040 return Mode == DenormalMode::getIEEE();
13041 }
13042
13043 if (MaxDepth == 0)
13044 return false;
13045
13046 switch (Opcode) {
13047 case AMDGPU::G_FADD:
13048 case AMDGPU::G_FSUB:
13049 case AMDGPU::G_FMUL:
13050 case AMDGPU::G_FCEIL:
13051 case AMDGPU::G_FFLOOR:
13052 case AMDGPU::G_FRINT:
13053 case AMDGPU::G_FNEARBYINT:
13054 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13055 case AMDGPU::G_INTRINSIC_TRUNC:
13056 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13057 case AMDGPU::G_FMA:
13058 case AMDGPU::G_FMAD:
13059 case AMDGPU::G_FSQRT:
13060 case AMDGPU::G_FDIV:
13061 case AMDGPU::G_FREM:
13062 case AMDGPU::G_FPOW:
13063 case AMDGPU::G_FPEXT:
13064 case AMDGPU::G_FLOG:
13065 case AMDGPU::G_FLOG2:
13066 case AMDGPU::G_FLOG10:
13067 case AMDGPU::G_FPTRUNC:
13068 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13069 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13070 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13071 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13072 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13073 return true;
13074 case AMDGPU::G_FNEG:
13075 case AMDGPU::G_FABS:
13076 case AMDGPU::G_FCOPYSIGN:
13077 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13078 case AMDGPU::G_FMINNUM:
13079 case AMDGPU::G_FMAXNUM:
13080 case AMDGPU::G_FMINNUM_IEEE:
13081 case AMDGPU::G_FMAXNUM_IEEE:
13082 case AMDGPU::G_FMINIMUM:
13083 case AMDGPU::G_FMAXIMUM: {
13084 if (Subtarget->supportsMinMaxDenormModes() ||
13085 // FIXME: denormalsEnabledForType is broken for dynamic
13086 denormalsEnabledForType(MRI.getType(Reg), MF))
13087 return true;
13088
13089 [[fallthrough]];
13090 }
13091 case AMDGPU::G_BUILD_VECTOR:
13092 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13093 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13094 return false;
13095 return true;
13096 case AMDGPU::G_INTRINSIC:
13097 case AMDGPU::G_INTRINSIC_CONVERGENT:
13098 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13099 case Intrinsic::amdgcn_fmul_legacy:
13100 case Intrinsic::amdgcn_fmad_ftz:
13101 case Intrinsic::amdgcn_sqrt:
13102 case Intrinsic::amdgcn_fmed3:
13103 case Intrinsic::amdgcn_sin:
13104 case Intrinsic::amdgcn_cos:
13105 case Intrinsic::amdgcn_log:
13106 case Intrinsic::amdgcn_exp2:
13107 case Intrinsic::amdgcn_log_clamp:
13108 case Intrinsic::amdgcn_rcp:
13109 case Intrinsic::amdgcn_rcp_legacy:
13110 case Intrinsic::amdgcn_rsq:
13111 case Intrinsic::amdgcn_rsq_clamp:
13112 case Intrinsic::amdgcn_rsq_legacy:
13113 case Intrinsic::amdgcn_div_scale:
13114 case Intrinsic::amdgcn_div_fmas:
13115 case Intrinsic::amdgcn_div_fixup:
13116 case Intrinsic::amdgcn_fract:
13117 case Intrinsic::amdgcn_cvt_pkrtz:
13118 case Intrinsic::amdgcn_cubeid:
13119 case Intrinsic::amdgcn_cubema:
13120 case Intrinsic::amdgcn_cubesc:
13121 case Intrinsic::amdgcn_cubetc:
13122 case Intrinsic::amdgcn_frexp_mant:
13123 case Intrinsic::amdgcn_fdot2:
13124 case Intrinsic::amdgcn_trig_preop:
13125 return true;
13126 default:
13127 break;
13128 }
13129
13130 [[fallthrough]];
13131 default:
13132 return false;
13133 }
13134
13135 llvm_unreachable("invalid operation");
13136}
13137
13138// Constant fold canonicalize.
13139SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13140 const SDLoc &SL, EVT VT,
13141 const APFloat &C) const {
13142 // Flush denormals to 0 if not enabled.
13143 if (C.isDenormal()) {
13144 DenormalMode Mode =
13145 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13146 if (Mode == DenormalMode::getPreserveSign()) {
13147 return DAG.getConstantFP(
13148 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13149 }
13150
13151 if (Mode != DenormalMode::getIEEE())
13152 return SDValue();
13153 }
13154
13155 if (C.isNaN()) {
13156 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13157 if (C.isSignaling()) {
13158 // Quiet a signaling NaN.
13159 // FIXME: Is this supposed to preserve payload bits?
13160 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13161 }
13162
13163 // Make sure it is the canonical NaN bitpattern.
13164 //
13165 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13166 // immediate?
13167 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13168 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13169 }
13170
13171 // Already canonical.
13172 return DAG.getConstantFP(C, SL, VT);
13173}
13174
13175 static bool vectorEltWillFoldAway(SDValue Op) {
13176 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13177}
13178
13179SDValue
13180SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13181 DAGCombinerInfo &DCI) const {
13182 SelectionDAG &DAG = DCI.DAG;
13183 SDValue N0 = N->getOperand(0);
13184 EVT VT = N->getValueType(0);
13185
13186 // fcanonicalize undef -> qnan
13187 if (N0.isUndef()) {
13188 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13189 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13190 }
13191
13192 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13193 EVT VT = N->getValueType(0);
13194 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13195 }
13196
13197 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13198 // (fcanonicalize k)
13199 //
13200 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13201
13202 // TODO: This could be better with wider vectors that will be split to v2f16,
13203 // and to consider uses since there aren't that many packed operations.
13204 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13205 isTypeLegal(MVT::v2f16)) {
13206 SDLoc SL(N);
13207 SDValue NewElts[2];
13208 SDValue Lo = N0.getOperand(0);
13209 SDValue Hi = N0.getOperand(1);
13210 EVT EltVT = Lo.getValueType();
13211
13213 for (unsigned I = 0; I != 2; ++I) {
13214 SDValue Op = N0.getOperand(I);
13215 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13216 NewElts[I] =
13217 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13218 } else if (Op.isUndef()) {
13219 // Handled below based on what the other operand is.
13220 NewElts[I] = Op;
13221 } else {
13222 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13223 }
13224 }
13225
13226 // If one half is undef, and one is constant, prefer a splat vector rather
13227 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13228 // cheaper to use and may be free with a packed operation.
13229 if (NewElts[0].isUndef()) {
13230 if (isa<ConstantFPSDNode>(NewElts[1]))
13231 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13232 ? NewElts[1]
13233 : DAG.getConstantFP(0.0f, SL, EltVT);
13234 }
13235
13236 if (NewElts[1].isUndef()) {
13237 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13238 ? NewElts[0]
13239 : DAG.getConstantFP(0.0f, SL, EltVT);
13240 }
13241
13242 return DAG.getBuildVector(VT, SL, NewElts);
13243 }
13244 }
13245
13246 return SDValue();
13247}
13248
13249static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13250 switch (Opc) {
13251 case ISD::FMAXNUM:
13252 case ISD::FMAXNUM_IEEE:
13253 return AMDGPUISD::FMAX3;
13254 case ISD::FMAXIMUM:
13255 return AMDGPUISD::FMAXIMUM3;
13256 case ISD::SMAX:
13257 return AMDGPUISD::SMAX3;
13258 case ISD::UMAX:
13259 return AMDGPUISD::UMAX3;
13260 case ISD::FMINNUM:
13261 case ISD::FMINNUM_IEEE:
13262 return AMDGPUISD::FMIN3;
13263 case ISD::FMINIMUM:
13264 return AMDGPUISD::FMINIMUM3;
13265 case ISD::SMIN:
13266 return AMDGPUISD::SMIN3;
13267 case ISD::UMIN:
13268 return AMDGPUISD::UMIN3;
13269 default:
13270 llvm_unreachable("Not a min/max opcode");
13271 }
13272}
13273
13274SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13275 const SDLoc &SL, SDValue Src,
13276 SDValue MinVal,
13277 SDValue MaxVal,
13278 bool Signed) const {
13279
13280 // med3 comes from
13281 // min(max(x, K0), K1), K0 < K1
13282 // max(min(x, K0), K1), K1 < K0
13283 //
13284 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13285 // min/max op.
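// For example, (smin (smax x, -5), 17) becomes a signed med3 of x, -5 and 17,
// which clamps x to the range [-5, 17] (illustrative constants).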
13286 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13287 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13288
13289 if (!MinK || !MaxK)
13290 return SDValue();
13291
13292 if (Signed) {
13293 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13294 return SDValue();
13295 } else {
13296 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13297 return SDValue();
13298 }
13299
13300 EVT VT = MinK->getValueType(0);
13301 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13302 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13303 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13304
13305 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13306 // not available, but this is unlikely to be profitable as constants
13307 // will often need to be materialized & extended, especially on
13308 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13309 return SDValue();
13310}
13311
13312 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13313 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13314 return C;
13315
13316 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13317 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13318 return C;
13319 }
13320
13321 return nullptr;
13322}
13323
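// FP variant of the clamp pattern: with constant (or splat-constant) bounds
// this becomes CLAMP when the bounds are exactly [0.0, 1.0] under DX10 clamp,
// and FMED3 otherwise (f32, or f16 where med3 is available).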
13324SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13325 const SDLoc &SL, SDValue Op0,
13326 SDValue Op1) const {
13327 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13328 if (!K1)
13329 return SDValue();
13330
13332 if (!K0)
13333 return SDValue();
13334
13335 // Ordered >= (although NaN inputs should have folded away by now).
13336 if (K0->getValueAPF() > K1->getValueAPF())
13337 return SDValue();
13338
13339 const MachineFunction &MF = DAG.getMachineFunction();
13340 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13341
13342 // TODO: Check IEEE bit enabled?
13343 EVT VT = Op0.getValueType();
13344 if (Info->getMode().DX10Clamp) {
13345 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13346 // hardware fmed3 behavior converting to a min.
13347 // FIXME: Should this be allowing -0.0?
13348 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13349 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13350 }
13351
13352 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13353 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13354 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13355 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13356 // then give the other result, which is different from med3 with a NaN
13357 // input.
13358 SDValue Var = Op0.getOperand(0);
13359 if (!DAG.isKnownNeverSNaN(Var))
13360 return SDValue();
13361
13362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13363
13364 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13365 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13366 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13367 SDValue(K0, 0), SDValue(K1, 0));
13368 }
13369 }
13370
13371 return SDValue();
13372}
13373
13374/// \return true if the subtarget supports minimum3 and maximum3 with the given
13375/// base min/max opcode \p Opc for type \p VT.
13376static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13377 EVT VT) {
13378 switch (Opc) {
13379 case ISD::FMINNUM:
13380 case ISD::FMAXNUM:
13381 case ISD::FMINNUM_IEEE:
13382 case ISD::FMAXNUM_IEEE:
13383 case AMDGPUISD::FMIN_LEGACY:
13384 case AMDGPUISD::FMAX_LEGACY:
13385 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13386 case ISD::FMINIMUM:
13387 case ISD::FMAXIMUM:
13388 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13389 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13390 case ISD::SMAX:
13391 case ISD::SMIN:
13392 case ISD::UMAX:
13393 case ISD::UMIN:
13394 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13395 default:
13396 return false;
13397 }
13398
13399 llvm_unreachable("not a min/max opcode");
13400}
13401
13402SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13403 DAGCombinerInfo &DCI) const {
13404 SelectionDAG &DAG = DCI.DAG;
13405
13406 EVT VT = N->getValueType(0);
13407 unsigned Opc = N->getOpcode();
13408 SDValue Op0 = N->getOperand(0);
13409 SDValue Op1 = N->getOperand(1);
13410
13411 // Only do this if the inner op has one use since this will just increase
13412 // register pressure for no benefit.
13413
13414 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13415 // max(max(a, b), c) -> max3(a, b, c)
13416 // min(min(a, b), c) -> min3(a, b, c)
13417 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13418 SDLoc DL(N);
13419 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13420 Op0.getOperand(0), Op0.getOperand(1), Op1);
13421 }
13422
13423 // Try commuted.
13424 // max(a, max(b, c)) -> max3(a, b, c)
13425 // min(a, min(b, c)) -> min3(a, b, c)
13426 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13427 SDLoc DL(N);
13428 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13429 Op0, Op1.getOperand(0), Op1.getOperand(1));
13430 }
13431 }
13432
13433 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13434 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13435 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13436 if (SDValue Med3 = performIntMed3ImmCombine(
13437 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13438 return Med3;
13439 }
13440 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13441 if (SDValue Med3 = performIntMed3ImmCombine(
13442 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13443 return Med3;
13444 }
13445
13446 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13447 if (SDValue Med3 = performIntMed3ImmCombine(
13448 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13449 return Med3;
13450 }
13451 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13452 if (SDValue Med3 = performIntMed3ImmCombine(
13453 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13454 return Med3;
13455 }
13456
13457 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13458 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13459 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13460 (Opc == AMDGPUISD::FMIN_LEGACY &&
13461 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13462 (VT == MVT::f32 || VT == MVT::f64 ||
13463 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13464 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13465 Op0.hasOneUse()) {
13466 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13467 return Res;
13468 }
13469
13470 return SDValue();
13471}
13472 static bool isClampZeroToOne(SDValue A, SDValue B) {
13472
13474 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13475 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13476 // FIXME: Should this be allowing -0.0?
13477 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13478 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13479 }
13480 }
13481
13482 return false;
13483}
13484
13485// FIXME: Should only worry about snans for version with chain.
13486SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13487 DAGCombinerInfo &DCI) const {
13488 EVT VT = N->getValueType(0);
13489 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13490 // NaNs. With a NaN input, the order of the operands may change the result.
13491
13492 SelectionDAG &DAG = DCI.DAG;
13493 SDLoc SL(N);
13494
13495 SDValue Src0 = N->getOperand(0);
13496 SDValue Src1 = N->getOperand(1);
13497 SDValue Src2 = N->getOperand(2);
13498
13499 if (isClampZeroToOne(Src0, Src1)) {
13500 // const_a, const_b, x -> clamp is safe in all cases including signaling
13501 // nans.
13502 // FIXME: Should this be allowing -0.0?
13503 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13504 }
13505
13506 const MachineFunction &MF = DAG.getMachineFunction();
13507 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13508
13509 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13510 // handling no dx10-clamp?
13511 if (Info->getMode().DX10Clamp) {
13512 // If NaNs are clamped to 0, we are free to reorder the inputs.
13513
13514 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13515 std::swap(Src0, Src1);
13516
13517 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13518 std::swap(Src1, Src2);
13519
13520 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13521 std::swap(Src0, Src1);
13522
13523 if (isClampZeroToOne(Src1, Src2))
13524 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13525 }
13526
13527 return SDValue();
13528}
13529
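// CVT_PKRTZ_F16_F32 with two undef operands folds to undef.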
13530SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13531 DAGCombinerInfo &DCI) const {
13532 SDValue Src0 = N->getOperand(0);
13533 SDValue Src1 = N->getOperand(1);
13534 if (Src0.isUndef() && Src1.isUndef())
13535 return DCI.DAG.getUNDEF(N->getValueType(0));
13536 return SDValue();
13537}
13538
13539// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13540// expanded into a set of cmp/select instructions.
13541 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13542 unsigned NumElem,
13543 bool IsDivergentIdx,
13544 const GCNSubtarget *Subtarget) {
13545 if (UseDivergentRegisterIndexing)
13546 return false;
13547
13548 unsigned VecSize = EltSize * NumElem;
13549
13550 // Sub-dword vectors of two dwords or less have a better implementation.
13551 if (VecSize <= 64 && EltSize < 32)
13552 return false;
13553
13554 // Always expand the remaining sub-dword instructions, otherwise they will be
13555 // lowered via memory.
13556 if (EltSize < 32)
13557 return true;
13558
13559 // Always do this if var-idx is divergent, otherwise it will become a loop.
13560 if (IsDivergentIdx)
13561 return true;
13562
13563 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13564 unsigned NumInsts = NumElem /* Number of compares */ +
13565 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13566
13567 // On some architectures (GFX9) movrel is not available and it's better
13568 // to expand.
13569 if (Subtarget->useVGPRIndexMode())
13570 return NumInsts <= 16;
13571
13572 // If movrel is available, use it instead of expanding for vector of 8
13573 // elements.
13574 if (Subtarget->hasMovrel())
13575 return NumInsts <= 15;
13576
13577 return true;
13578}
13579
13580 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13581 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13582 if (isa<ConstantSDNode>(Idx))
13583 return false;
13584
13585 SDValue Vec = N->getOperand(0);
13586 EVT VecVT = Vec.getValueType();
13587 EVT EltVT = VecVT.getVectorElementType();
13588 unsigned EltSize = EltVT.getSizeInBits();
13589 unsigned NumElem = VecVT.getVectorNumElements();
13590
13591 return SITargetLowering::shouldExpandVectorDynExt(
13592 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13593}
13594
13595SDValue
13596SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13597 DAGCombinerInfo &DCI) const {
13598 SDValue Vec = N->getOperand(0);
13599 SelectionDAG &DAG = DCI.DAG;
13600
13601 EVT VecVT = Vec.getValueType();
13602 EVT VecEltVT = VecVT.getVectorElementType();
13603 EVT ResVT = N->getValueType(0);
13604
13605 unsigned VecSize = VecVT.getSizeInBits();
13606 unsigned VecEltSize = VecEltVT.getSizeInBits();
13607
13608 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13609 allUsesHaveSourceMods(N)) {
13610 SDLoc SL(N);
13611 SDValue Idx = N->getOperand(1);
13612 SDValue Elt =
13613 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13614 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13615 }
13616
13617 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13618 // =>
13619 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13620 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13621 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13622 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13623 SDLoc SL(N);
13624 SDValue Idx = N->getOperand(1);
13625 unsigned Opc = Vec.getOpcode();
13626
13627 switch (Opc) {
13628 default:
13629 break;
13630 // TODO: Support other binary operations.
13631 case ISD::FADD:
13632 case ISD::FSUB:
13633 case ISD::FMUL:
13634 case ISD::ADD:
13635 case ISD::UMIN:
13636 case ISD::UMAX:
13637 case ISD::SMIN:
13638 case ISD::SMAX:
13639 case ISD::FMAXNUM:
13640 case ISD::FMINNUM:
13641 case ISD::FMAXNUM_IEEE:
13642 case ISD::FMINNUM_IEEE:
13643 case ISD::FMAXIMUM:
13644 case ISD::FMINIMUM: {
13645 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13646 Vec.getOperand(0), Idx);
13647 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13648 Vec.getOperand(1), Idx);
13649
13650 DCI.AddToWorklist(Elt0.getNode());
13651 DCI.AddToWorklist(Elt1.getNode());
13652 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13653 }
13654 }
13655 }
13656
13657 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13658 if (shouldExpandVectorDynExt(N)) {
13659 SDLoc SL(N);
13660 SDValue Idx = N->getOperand(1);
13661 SDValue V;
13662 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13663 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13664 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13665 if (I == 0)
13666 V = Elt;
13667 else
13668 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13669 }
13670 return V;
13671 }
13672
13673 if (!DCI.isBeforeLegalize())
13674 return SDValue();
13675
13676 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13677 // elements. This exposes more load reduction opportunities by replacing
13678 // multiple small extract_vector_elements with a single 32-bit extract.
13679 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13680 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13681 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13682 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13683
13684 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13685 unsigned EltIdx = BitIndex / 32;
13686 unsigned LeftoverBitIdx = BitIndex % 32;
13687 SDLoc SL(N);
13688
13689 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13690 DCI.AddToWorklist(Cast.getNode());
13691
13692 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13693 DAG.getConstant(EltIdx, SL, MVT::i32));
13694 DCI.AddToWorklist(Elt.getNode());
13695 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13696 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13697 DCI.AddToWorklist(Srl.getNode());
13698
13699 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13700 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13701 DCI.AddToWorklist(Trunc.getNode());
13702
13703 if (VecEltVT == ResVT) {
13704 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13705 }
13706
13707 assert(ResVT.isScalarInteger());
13708 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13709 }
13710
13711 return SDValue();
13712}
13713
13714SDValue
13715SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13716 DAGCombinerInfo &DCI) const {
13717 SDValue Vec = N->getOperand(0);
13718 SDValue Idx = N->getOperand(2);
13719 EVT VecVT = Vec.getValueType();
13720 EVT EltVT = VecVT.getVectorElementType();
13721
13722 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13723 // => BUILD_VECTOR n x select (e, const-idx)
13724 if (!shouldExpandVectorDynExt(N))
13725 return SDValue();
13726
13727 SelectionDAG &DAG = DCI.DAG;
13728 SDLoc SL(N);
13729 SDValue Ins = N->getOperand(1);
13730 EVT IdxVT = Idx.getValueType();
13731
13732 SmallVector<SDValue, 16> Ops;
13733 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13734 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13735 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13736 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13737 Ops.push_back(V);
13738 }
13739
13740 return DAG.getBuildVector(VecVT, SL, Ops);
13741}
13742
13743/// Return the source of an fp_extend from f16 to f32, or a converted FP
13744/// constant.
13745 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13746 if (Src.getOpcode() == ISD::FP_EXTEND &&
13747 Src.getOperand(0).getValueType() == MVT::f16) {
13748 return Src.getOperand(0);
13749 }
13750
13751 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13752 APFloat Val = CFP->getValueAPF();
13753 bool LosesInfo = true;
13754 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13755 if (!LosesInfo)
13756 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13757 }
13758
13759 return SDValue();
13760}
13761
13762SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13763 DAGCombinerInfo &DCI) const {
13764 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13765 "combine only useful on gfx8");
13766
13767 SDValue TruncSrc = N->getOperand(0);
13768 EVT VT = N->getValueType(0);
13769 if (VT != MVT::f16)
13770 return SDValue();
13771
13772 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13773 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13774 return SDValue();
13775
13776 SelectionDAG &DAG = DCI.DAG;
13777 SDLoc SL(N);
13778
13779 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13780 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13781 // casting back.
13782
13783 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13784 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13785 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13786 if (!A)
13787 return SDValue();
13788
13789 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13790 if (!B)
13791 return SDValue();
13792
13793 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13794 if (!C)
13795 return SDValue();
13796
13797 // This changes signaling nan behavior. If an input is a signaling nan, it
13798 // would have been quieted by the fpext originally. We don't care because
13799 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13800 // we would be worse off than just doing the promotion.
13801 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13802 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13803 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13804 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13805}
13806
13807unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13808 const SDNode *N0,
13809 const SDNode *N1) const {
13810 EVT VT = N0->getValueType(0);
13811
13812 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13813 // support denormals ever.
13814 if (((VT == MVT::f32 &&
13815 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13816 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13817 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13818 isOperationLegal(ISD::FMAD, VT))
13819 return ISD::FMAD;
13820
13821 const TargetOptions &Options = DAG.getTarget().Options;
13822 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13823 (N0->getFlags().hasAllowContract() &&
13824 N1->getFlags().hasAllowContract())) &&
13825 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13826 return ISD::FMA;
13827 }
13828
13829 return 0;
13830}
13831
13832// For a reassociatable opcode perform:
13833// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13834SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13835 SelectionDAG &DAG) const {
13836 EVT VT = N->getValueType(0);
13837 if (VT != MVT::i32 && VT != MVT::i64)
13838 return SDValue();
13839
13840 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13841 return SDValue();
13842
13843 unsigned Opc = N->getOpcode();
13844 SDValue Op0 = N->getOperand(0);
13845 SDValue Op1 = N->getOperand(1);
13846
13847 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13848 return SDValue();
13849
13850 if (Op0->isDivergent())
13851 std::swap(Op0, Op1);
13852
13853 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13854 return SDValue();
13855
13856 SDValue Op2 = Op1.getOperand(1);
13857 Op1 = Op1.getOperand(0);
13858 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13859 return SDValue();
13860
13861 if (Op1->isDivergent())
13862 std::swap(Op1, Op2);
13863
13864 SDLoc SL(N);
13865 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13866 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13867}
13868
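// Build a MAD_I64_I32/MAD_U64_U32 node computing N0 * N1 + N2 with 32-bit
// factors and a 64-bit accumulate, then truncate the i64 result to VT. The
// i1 carry-out result of the node is left unused.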
13869static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13870 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13871 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13872 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13873 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13874 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13875}
13876
13877// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13878// multiplies, if any.
13879//
13880// Full 64-bit multiplies that feed into an addition are lowered here instead
13881// of using the generic expansion. The generic expansion ends up with
13882// a tree of ADD nodes that prevents us from using the "add" part of the
13883// MAD instruction. The expansion produced here results in a chain of ADDs
13884// instead of a tree.
13885SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13886 DAGCombinerInfo &DCI) const {
13887 assert(N->getOpcode() == ISD::ADD);
13888
13889 SelectionDAG &DAG = DCI.DAG;
13890 EVT VT = N->getValueType(0);
13891 SDLoc SL(N);
13892 SDValue LHS = N->getOperand(0);
13893 SDValue RHS = N->getOperand(1);
13894
13895 if (VT.isVector())
13896 return SDValue();
13897
13898 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13899 // result in scalar registers for uniform values.
13900 if (!N->isDivergent() && Subtarget->hasSMulHi())
13901 return SDValue();
13902
13903 unsigned NumBits = VT.getScalarSizeInBits();
13904 if (NumBits <= 32 || NumBits > 64)
13905 return SDValue();
13906
13907 if (LHS.getOpcode() != ISD::MUL) {
13908 assert(RHS.getOpcode() == ISD::MUL);
13909 std::swap(LHS, RHS);
13910 }
13911
13912 // Avoid the fold if it would unduly increase the number of multiplies due to
13913 // multiple uses, except on hardware with full-rate multiply-add (which is
13914 // part of full-rate 64-bit ops).
13915 if (!Subtarget->hasFullRate64Ops()) {
13916 unsigned NumUsers = 0;
13917 for (SDNode *User : LHS->users()) {
13918 // There is a use that does not feed into addition, so the multiply can't
13919 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13920 if (User->getOpcode() != ISD::ADD)
13921 return SDValue();
13922
13923 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13924 // MUL + 3xADD + 3xADDC over 3xMAD.
13925 ++NumUsers;
13926 if (NumUsers >= 3)
13927 return SDValue();
13928 }
13929 }
13930
13931 SDValue MulLHS = LHS.getOperand(0);
13932 SDValue MulRHS = LHS.getOperand(1);
13933 SDValue AddRHS = RHS;
13934
13935 // Always check whether operands are small unsigned values, since that
13936 // knowledge is useful in more cases. Check for small signed values only if
13937 // doing so can unlock a shorter code sequence.
13938 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13939 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13940
13941 bool MulSignedLo = false;
13942 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13943 MulSignedLo =
13944 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13945 }
13946
13947 // The operands and final result all have the same number of bits. If
13948 // operands need to be extended, they can be extended with garbage. The
13949 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13950 // truncated away in the end.
13951 if (VT != MVT::i64) {
13952 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13953 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13954 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13955 }
13956
13957 // The basic code generated is conceptually straightforward. Pseudo code:
13958 //
13959 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13960 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13961 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13962 //
13963 // The second and third lines are optional, depending on whether the factors
13964 // are {sign,zero}-extended or not.
13965 //
13966 // The actual DAG is noisier than the pseudo code, but only due to
13967 // instructions that disassemble values into low and high parts, and
13968 // assemble the final result.
13969 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13970
13971 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13972 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13973 SDValue Accum =
13974 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13975
13976 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13977 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13978
13979 if (!MulLHSUnsigned32) {
13980 auto MulLHSHi =
13981 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13982 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13983 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13984 }
13985
13986 if (!MulRHSUnsigned32) {
13987 auto MulRHSHi =
13988 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13989 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13990 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13991 }
13992
13993 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13994 Accum = DAG.getBitcast(MVT::i64, Accum);
13995 }
13996
13997 if (VT != MVT::i64)
13998 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13999 return Accum;
14000}
14001
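// For a 64-bit add/sub whose constant RHS has 32 trailing zero bits, the low
// half of the result is just the low half of the LHS, so the operation can be
// performed on the high halves alone.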
14002SDValue
14003SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14004 DAGCombinerInfo &DCI) const {
14005 SDValue RHS = N->getOperand(1);
14006 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14007 if (!CRHS)
14008 return SDValue();
14009
14010 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14011 // common.
14012 uint64_t Val = CRHS->getZExtValue();
14013 if (countr_zero(Val) >= 32) {
14014 SelectionDAG &DAG = DCI.DAG;
14015 SDLoc SL(N);
14016 SDValue LHS = N->getOperand(0);
14017
14018 // Avoid carry machinery if we know the low half of the add does not
14019 // contribute to the final result.
14020 //
14021 // add i64:x, K if computeTrailingZeros(K) >= 32
14022 // => build_pair (add x.hi, K.hi), x.lo
14023
14024 // Breaking the 64-bit add here with this strange constant is unlikely
14025 // to interfere with addressing mode patterns.
14026
14027 SDValue Hi = getHiHalf64(LHS, DAG);
14028 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14029 SDValue AddHi =
14030 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14031
14032 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14033 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14034 }
14035
14036 return SDValue();
14037}
14038
14039// Collect the ultimate src of each of the mul node's operands, and confirm
14040 // each operand is at most 8 bits wide.
14041static std::optional<ByteProvider<SDValue>>
14042handleMulOperand(const SDValue &MulOperand) {
14043 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14044 if (!Byte0 || Byte0->isConstantZero()) {
14045 return std::nullopt;
14046 }
14047 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14048 if (Byte1 && !Byte1->isConstantZero()) {
14049 return std::nullopt;
14050 }
14051 return Byte0;
14052}
14053
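// Merge two v_perm byte-select masks. 0x0c selects a constant zero byte; for
// each byte position the non-0x0c selector wins, and the result stays 0x0c
// only where both inputs are 0x0c.
// e.g. addPermMasks(0x0c0c0304, 0x07060c0c) == 0x07060304.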
14054static unsigned addPermMasks(unsigned First, unsigned Second) {
14055 unsigned FirstCs = First & 0x0c0c0c0c;
14056 unsigned SecondCs = Second & 0x0c0c0c0c;
14057 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14058 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14059
14060 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14061 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14062 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14063 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14064
14065 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14066}
14067
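// One dword-sized piece of a dot4 operand: the node it comes from, the v_perm
// byte-select mask accumulated for it, and the dword offset within that node.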
14068struct DotSrc {
14069 SDValue SrcOp;
14070 int64_t PermMask;
14071 int64_t DWordOffset;
14072};
14073
14074 static void placeSources(ByteProvider<SDValue> &Src0,
14075 ByteProvider<SDValue> &Src1,
14076 SmallVectorImpl<DotSrc> &Src0s,
14077 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14078
14079 assert(Src0.Src.has_value() && Src1.Src.has_value());
14080 // Src0s and Src1s are empty, just place arbitrarily.
14081 if (Step == 0) {
14082 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14083 Src0.SrcOffset / 4});
14084 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14085 Src1.SrcOffset / 4});
14086 return;
14087 }
14088
14089 for (int BPI = 0; BPI < 2; BPI++) {
14090 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14091 if (BPI == 1) {
14092 BPP = {Src1, Src0};
14093 }
14094 unsigned ZeroMask = 0x0c0c0c0c;
14095 unsigned FMask = 0xFF << (8 * (3 - Step));
14096
14097 unsigned FirstMask =
14098 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14099 unsigned SecondMask =
14100 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14101 // Attempt to find Src vector which contains our SDValue, if so, add our
14102 // perm mask to the existing one. If we are unable to find a match for the
14103 // first SDValue, attempt to find a match for the second.
14104 int FirstGroup = -1;
14105 for (int I = 0; I < 2; I++) {
14106 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14107 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14108 return IterElt.SrcOp == *BPP.first.Src &&
14109 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14110 };
14111
14112 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14113 if (Match != Srcs.end()) {
14114 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14115 FirstGroup = I;
14116 break;
14117 }
14118 }
14119 if (FirstGroup != -1) {
14120 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14121 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14122 return IterElt.SrcOp == *BPP.second.Src &&
14123 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14124 };
14125 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14126 if (Match != Srcs.end()) {
14127 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14128 } else
14129 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14130 return;
14131 }
14132 }
14133
14134 // If we have made it here, then we could not find a match in Src0s or Src1s
14135 // for either Src0 or Src1, so just place them arbitrarily.
14136
14137 unsigned ZeroMask = 0x0c0c0c0c;
14138 unsigned FMask = 0xFF << (8 * (3 - Step));
14139
14140 Src0s.push_back(
14141 {*Src0.Src,
14142 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14143 Src0.SrcOffset / 4});
14144 Src1s.push_back(
14145 {*Src1.Src,
14146 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14147 Src1.SrcOffset / 4});
14148}
14149
14150 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14151 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14152 bool IsAny) {
14153
14154 // If we just have one source, just permute it accordingly.
14155 if (Srcs.size() == 1) {
14156 auto *Elt = Srcs.begin();
14157 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14158
14159 // v_perm will produce the original value
14160 if (Elt->PermMask == 0x3020100)
14161 return EltOp;
14162
14163 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14164 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14165 }
14166
14167 auto *FirstElt = Srcs.begin();
14168 auto *SecondElt = std::next(FirstElt);
14169
14170 SmallVector<SDValue, 2> Perms;
14171
14172 // If we have multiple sources in the chain, combine them via perms (using
14173 // calculated perm mask) and Ors.
14174 while (true) {
14175 auto FirstMask = FirstElt->PermMask;
14176 auto SecondMask = SecondElt->PermMask;
14177
14178 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14179 unsigned FirstPlusFour = FirstMask | 0x04040404;
14180 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14181 // original 0x0C.
14182 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14183
14184 auto PermMask = addPermMasks(FirstMask, SecondMask);
14185 auto FirstVal =
14186 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14187 auto SecondVal =
14188 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14189
14190 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14191 SecondVal,
14192 DAG.getConstant(PermMask, SL, MVT::i32)));
14193
14194 FirstElt = std::next(SecondElt);
14195 if (FirstElt == Srcs.end())
14196 break;
14197
14198 SecondElt = std::next(FirstElt);
14199 // If we only have a FirstElt, then just combine that into the cumulative
14200 // source node.
14201 if (SecondElt == Srcs.end()) {
14202 auto EltOp =
14203 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14204
14205 Perms.push_back(
14206 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14207 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14208 break;
14209 }
14210 }
14211
14212 assert(Perms.size() == 1 || Perms.size() == 2);
14213 return Perms.size() == 2
14214 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14215 : Perms[0];
14216}
14217
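// The per-source masks are built assuming a chain of length 4. For shorter
// chains, shift the accumulated byte selectors down into the low bytes and
// mark the unused high bytes as constant zero (0x0c).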
14218static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14219 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14220 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14221 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14222 EntryMask += ZeroMask;
14223 }
14224}
14225
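// Match the multiply nodes (including the 24-bit variants) that can start a
// dot4 chain.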
14226static bool isMul(const SDValue Op) {
14227 auto Opcode = Op.getOpcode();
14228
14229 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14230 Opcode == AMDGPUISD::MUL_I24);
14231}
14232
14233static std::optional<bool>
14234 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14235 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14236 const SDValue &S1Op, const SelectionDAG &DAG) {
14237 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14238 // of the dot4 are irrelevant.
14239 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14240 return false;
14241
14242 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14243 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14244 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14245 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14246 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14247 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14248
14249 assert(!(S0IsUnsigned && S0IsSigned));
14250 assert(!(S1IsUnsigned && S1IsSigned));
14251
14252 // There are 9 possible permutations of
14253 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14254
14255 // In two permutations, the sign bits are known to be the same for both Ops,
14256 // so simply return Signed / Unsigned corresponding to the MSB
14257
14258 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14259 return S0IsSigned;
14260
14261 // In another two permutations, the sign bits are known to be opposite. In
14262 // this case return std::nullopt to indicate a bad match.
14263
14264 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14265 return std::nullopt;
14266
14267 // In the remaining five permutations, we don't know the value of the sign
14268 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14269 // the upper bits must be extension bits. Thus, the only way for the sign
14270 // bit to be unknown is if it was sign extended from an unknown value, or if
14271 // it was any extended. In either case, it is correct to use the signed
14272 // version of the dot4 signedness semantics.
14273
14274 // In two such permutations, we know the sign bit is set for
14275 // one op, and the other is unknown. It is okay to use the signed version of
14276 // dot4.
14277 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14278 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14279 return true;
14280
14281 // In one such permutation, we don't know either of the sign bits. It is okay
14282 // to use the signed version of dot4.
14283 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14284 return true;
14285
14286 // In two such permutations, we know the sign bit is unset for
14287 // one op, and the other is unknown. Return std::nullopt to indicate a
14288 // bad match.
14289 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14290 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14291 return std::nullopt;
14292
14293 llvm_unreachable("Fully covered condition");
14294}
14295
14296SDValue SITargetLowering::performAddCombine(SDNode *N,
14297 DAGCombinerInfo &DCI) const {
14298 SelectionDAG &DAG = DCI.DAG;
14299 EVT VT = N->getValueType(0);
14300 SDLoc SL(N);
14301 SDValue LHS = N->getOperand(0);
14302 SDValue RHS = N->getOperand(1);
14303
14304 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14305 if (Subtarget->hasMad64_32()) {
14306 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14307 return Folded;
14308 }
14309 }
14310
14311 if (SDValue V = reassociateScalarOps(N, DAG)) {
14312 return V;
14313 }
14314
14315 if (VT == MVT::i64) {
14316 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14317 return Folded;
14318 }
14319
14320 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14321 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14322 SDValue TempNode(N, 0);
14323 std::optional<bool> IsSigned;
14324 SmallVector<DotSrc, 4> Src0s;
14325 SmallVector<DotSrc, 4> Src1s;
14326 SmallVector<SDValue, 4> Src2s;
14327
14328 // Match the v_dot4 tree, while collecting src nodes.
14329 int ChainLength = 0;
14330 for (int I = 0; I < 4; I++) {
14331 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14332 if (MulIdx == -1)
14333 break;
14334 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14335 if (!Src0)
14336 break;
14337 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14338 if (!Src1)
14339 break;
14340
14341 auto IterIsSigned = checkDot4MulSignedness(
14342 TempNode->getOperand(MulIdx), *Src0, *Src1,
14343 TempNode->getOperand(MulIdx)->getOperand(0),
14344 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14345 if (!IterIsSigned)
14346 break;
14347 if (!IsSigned)
14348 IsSigned = *IterIsSigned;
14349 if (*IterIsSigned != *IsSigned)
14350 break;
14351 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14352 auto AddIdx = 1 - MulIdx;
14353 // Allow the special case where add (add (mul24, 0), mul24) was folded into
14354 // add (mul24, mul24).
14355 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14356 Src2s.push_back(TempNode->getOperand(AddIdx));
14357 auto Src0 =
14358 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14359 if (!Src0)
14360 break;
14361 auto Src1 =
14362 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14363 if (!Src1)
14364 break;
14365 auto IterIsSigned = checkDot4MulSignedness(
14366 TempNode->getOperand(AddIdx), *Src0, *Src1,
14367 TempNode->getOperand(AddIdx)->getOperand(0),
14368 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14369 if (!IterIsSigned)
14370 break;
14371 assert(IsSigned);
14372 if (*IterIsSigned != *IsSigned)
14373 break;
14374 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14375 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14376 ChainLength = I + 2;
14377 break;
14378 }
14379
14380 TempNode = TempNode->getOperand(AddIdx);
14381 Src2s.push_back(TempNode);
14382 ChainLength = I + 1;
14383 if (TempNode->getNumOperands() < 2)
14384 break;
14385 LHS = TempNode->getOperand(0);
14386 RHS = TempNode->getOperand(1);
14387 }
14388
14389 if (ChainLength < 2)
14390 return SDValue();
14391
14392 // Masks were constructed with the assumption that we would find a chain of
14393 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14394 // 0x0c) so they do not affect dot calculation.
14395 if (ChainLength < 4) {
14396 fixMasks(Src0s, ChainLength);
14397 fixMasks(Src1s, ChainLength);
14398 }
14399
14400 SDValue Src0, Src1;
14401
14402 // If we are just using a single source for both, and have permuted the
14403 // bytes consistently, we can just use the sources without permuting
14404 // (commutation).
14405 bool UseOriginalSrc = false;
14406 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14407 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14408 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14409 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14410 SmallVector<unsigned, 4> SrcBytes;
14411 auto Src0Mask = Src0s.begin()->PermMask;
14412 SrcBytes.push_back(Src0Mask & 0xFF000000);
14413 bool UniqueEntries = true;
14414 for (auto I = 1; I < 4; I++) {
14415 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14416
14417 if (is_contained(SrcBytes, NextByte)) {
14418 UniqueEntries = false;
14419 break;
14420 }
14421 SrcBytes.push_back(NextByte);
14422 }
14423
14424 if (UniqueEntries) {
14425 UseOriginalSrc = true;
14426
14427 auto *FirstElt = Src0s.begin();
14428 auto FirstEltOp =
14429 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14430
14431 auto *SecondElt = Src1s.begin();
14432 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14433 SecondElt->DWordOffset);
14434
14435 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14436 MVT::getIntegerVT(32));
14437 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14438 MVT::getIntegerVT(32));
14439 }
14440 }
14441
14442 if (!UseOriginalSrc) {
14443 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14444 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14445 }
14446
14447 assert(IsSigned);
14448 SDValue Src2 =
14449 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14450
14451 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14452 : Intrinsic::amdgcn_udot4,
14453 SL, MVT::i64);
14454
14455 assert(!VT.isVector());
14456 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14457 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14458
14459 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14460 }
14461
14462 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14463 return SDValue();
14464
14465 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14466 // add x, sext (setcc) => usubo_carry x, 0, setcc
14467 unsigned Opc = LHS.getOpcode();
14468 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14469 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14470 std::swap(RHS, LHS);
14471
14472 Opc = RHS.getOpcode();
14473 switch (Opc) {
14474 default:
14475 break;
14476 case ISD::ZERO_EXTEND:
14477 case ISD::SIGN_EXTEND:
14478 case ISD::ANY_EXTEND: {
14479 auto Cond = RHS.getOperand(0);
14480 // If this won't be a real VOPC output, we would still need to insert an
14481 // extra instruction anyway.
14482 if (!isBoolSGPR(Cond))
14483 break;
14484 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14485 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14486 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14487 return DAG.getNode(Opc, SL, VTList, Args);
14488 }
14489 case ISD::UADDO_CARRY: {
14490 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14491 if (!isNullConstant(RHS.getOperand(1)))
14492 break;
14493 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14494 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14495 }
14496 }
14497 return SDValue();
14498}
14499
14500SDValue SITargetLowering::performSubCombine(SDNode *N,
14501 DAGCombinerInfo &DCI) const {
14502 SelectionDAG &DAG = DCI.DAG;
14503 EVT VT = N->getValueType(0);
14504
14505 if (VT == MVT::i64) {
14506 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14507 return Folded;
14508 }
14509
14510 if (VT != MVT::i32)
14511 return SDValue();
14512
14513 SDLoc SL(N);
14514 SDValue LHS = N->getOperand(0);
14515 SDValue RHS = N->getOperand(1);
14516
14517 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14518 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14519 unsigned Opc = RHS.getOpcode();
14520 switch (Opc) {
14521 default:
14522 break;
14523 case ISD::ZERO_EXTEND:
14524 case ISD::SIGN_EXTEND:
14525 case ISD::ANY_EXTEND: {
14526 auto Cond = RHS.getOperand(0);
14527 // If this won't be a real VOPC output, we would still need to insert an
14528 // extra instruction anyway.
14529 if (!isBoolSGPR(Cond))
14530 break;
14531 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14532 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14533 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14534 return DAG.getNode(Opc, SL, VTList, Args);
14535 }
14536 }
14537
14538 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14539 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14540 if (!isNullConstant(LHS.getOperand(1)))
14541 return SDValue();
14542 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14543 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14544 }
14545 return SDValue();
14546}
14547
14548SDValue
14549SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14550 DAGCombinerInfo &DCI) const {
14551
14552 if (N->getValueType(0) != MVT::i32)
14553 return SDValue();
14554
14555 if (!isNullConstant(N->getOperand(1)))
14556 return SDValue();
14557
14558 SelectionDAG &DAG = DCI.DAG;
14559 SDValue LHS = N->getOperand(0);
14560
14561 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14562 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14563 unsigned LHSOpc = LHS.getOpcode();
14564 unsigned Opc = N->getOpcode();
14565 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14566 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14567 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14568 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14569 }
14570 return SDValue();
14571}
14572
14573SDValue SITargetLowering::performFAddCombine(SDNode *N,
14574 DAGCombinerInfo &DCI) const {
14575 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14576 return SDValue();
14577
14578 SelectionDAG &DAG = DCI.DAG;
14579 EVT VT = N->getValueType(0);
14580
14581 SDLoc SL(N);
14582 SDValue LHS = N->getOperand(0);
14583 SDValue RHS = N->getOperand(1);
14584
14585 // These should really be instruction patterns, but writing patterns with
14586 // source modifiers is a pain.
14587
14588 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14589 if (LHS.getOpcode() == ISD::FADD) {
14590 SDValue A = LHS.getOperand(0);
14591 if (A == LHS.getOperand(1)) {
14592 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14593 if (FusedOp != 0) {
14594 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14595 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14596 }
14597 }
14598 }
14599
14600 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14601 if (RHS.getOpcode() == ISD::FADD) {
14602 SDValue A = RHS.getOperand(0);
14603 if (A == RHS.getOperand(1)) {
14604 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14605 if (FusedOp != 0) {
14606 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14607 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14608 }
14609 }
14610 }
14611
14612 return SDValue();
14613}
14614
14615SDValue SITargetLowering::performFSubCombine(SDNode *N,
14616 DAGCombinerInfo &DCI) const {
14617 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14618 return SDValue();
14619
14620 SelectionDAG &DAG = DCI.DAG;
14621 SDLoc SL(N);
14622 EVT VT = N->getValueType(0);
14623 assert(!VT.isVector());
14624
14625 // Try to get the fneg to fold into the source modifier. This undoes generic
14626 // DAG combines and folds them into the mad.
14627 //
14628 // Only do this if we are not trying to support denormals. v_mad_f32 does
14629 // not support denormals ever.
14630 SDValue LHS = N->getOperand(0);
14631 SDValue RHS = N->getOperand(1);
14632 if (LHS.getOpcode() == ISD::FADD) {
14633 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14634 SDValue A = LHS.getOperand(0);
14635 if (A == LHS.getOperand(1)) {
14636 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14637 if (FusedOp != 0) {
14638 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14639 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14640
14641 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14642 }
14643 }
14644 }
14645
14646 if (RHS.getOpcode() == ISD::FADD) {
14647 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14648
14649 SDValue A = RHS.getOperand(0);
14650 if (A == RHS.getOperand(1)) {
14651 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14652 if (FusedOp != 0) {
14653 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14654 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14655 }
14656 }
14657 }
14658
14659 return SDValue();
14660}
14661
14662SDValue SITargetLowering::performFDivCombine(SDNode *N,
14663 DAGCombinerInfo &DCI) const {
14664 SelectionDAG &DAG = DCI.DAG;
14665 SDLoc SL(N);
14666 EVT VT = N->getValueType(0);
14667 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14668 return SDValue();
14669
14670 SDValue LHS = N->getOperand(0);
14671 SDValue RHS = N->getOperand(1);
14672
14673 SDNodeFlags Flags = N->getFlags();
14674 SDNodeFlags RHSFlags = RHS->getFlags();
14675 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14676 !RHS->hasOneUse())
14677 return SDValue();
14678
14679 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14680 bool IsNegative = false;
14681 if (CLHS->isExactlyValue(1.0) ||
14682 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14683 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14684 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14685 if (RHS.getOpcode() == ISD::FSQRT) {
14686 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14687 SDValue Rsq =
14688 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14689 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14690 }
14691 }
14692 }
14693
14694 return SDValue();
14695}
14696
14697SDValue SITargetLowering::performFMulCombine(SDNode *N,
14698 DAGCombinerInfo &DCI) const {
14699 SelectionDAG &DAG = DCI.DAG;
14700 EVT VT = N->getValueType(0);
14701 EVT ScalarVT = VT.getScalarType();
14702 EVT IntVT = VT.changeElementType(MVT::i32);
14703
14704 SDValue LHS = N->getOperand(0);
14705 SDValue RHS = N->getOperand(1);
14706
14707 // It is cheaper to realize i32 inline constants than to materialize f16 or f64
14708 // (or even non-inline f32) values; this is possible via ldexp usage,
14709 // as shown below:
14710 //
14711 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14712 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14713 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
14714 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14715 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14716 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14717 if (!TrueNode)
14718 return SDValue();
14719 const ConstantFPSDNode *FalseNode =
14720 isConstOrConstSplatFP(RHS.getOperand(2));
14721 if (!FalseNode)
14722 return SDValue();
14723
14724 if (TrueNode->isNegative() != FalseNode->isNegative())
14725 return SDValue();
14726
14727 // For f32, only non-inline constants should be transformed.
14728 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14729 if (ScalarVT == MVT::f32 &&
14730 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14731 TII->isInlineConstant(FalseNode->getValueAPF()))
14732 return SDValue();
14733
14734 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14735 if (TrueNodeExpVal == INT_MIN)
14736 return SDValue();
14737 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14738 if (FalseNodeExpVal == INT_MIN)
14739 return SDValue();
14740
14741 SDLoc SL(N);
14742 SDValue SelectNode =
14743 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14744 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14745 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14746
14747 LHS = TrueNode->isNegative()
14748 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14749 : LHS;
14750
14751 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14752 }
14753
14754 return SDValue();
14755}
14756
14757SDValue SITargetLowering::performFMACombine(SDNode *N,
14758 DAGCombinerInfo &DCI) const {
14759 SelectionDAG &DAG = DCI.DAG;
14760 EVT VT = N->getValueType(0);
14761 SDLoc SL(N);
14762
14763 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14764 return SDValue();
14765
14766 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14767 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14768 SDValue Op1 = N->getOperand(0);
14769 SDValue Op2 = N->getOperand(1);
14770 SDValue FMA = N->getOperand(2);
14771
14772 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14773 Op2.getOpcode() != ISD::FP_EXTEND)
14774 return SDValue();
14775
14776 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14777 // regardless of the denorm mode setting. Therefore,
14778 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14779 const TargetOptions &Options = DAG.getTarget().Options;
14780 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14781 (N->getFlags().hasAllowContract() &&
14782 FMA->getFlags().hasAllowContract())) {
14783 Op1 = Op1.getOperand(0);
14784 Op2 = Op2.getOperand(0);
14785 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14786 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14787 return SDValue();
14788
14789 SDValue Vec1 = Op1.getOperand(0);
14790 SDValue Idx1 = Op1.getOperand(1);
14791 SDValue Vec2 = Op2.getOperand(0);
14792
14793 SDValue FMAOp1 = FMA.getOperand(0);
14794 SDValue FMAOp2 = FMA.getOperand(1);
14795 SDValue FMAAcc = FMA.getOperand(2);
14796
14797 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14798 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14799 return SDValue();
14800
14801 FMAOp1 = FMAOp1.getOperand(0);
14802 FMAOp2 = FMAOp2.getOperand(0);
14803 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14804 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14805 return SDValue();
14806
14807 SDValue Vec3 = FMAOp1.getOperand(0);
14808 SDValue Vec4 = FMAOp2.getOperand(0);
14809 SDValue Idx2 = FMAOp1.getOperand(1);
14810
14811 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14812 // Idx1 and Idx2 cannot be the same.
14813 Idx1 == Idx2)
14814 return SDValue();
14815
14816 if (Vec1 == Vec2 || Vec3 == Vec4)
14817 return SDValue();
14818
14819 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14820 return SDValue();
14821
14822 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14823 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14824 DAG.getTargetConstant(0, SL, MVT::i1));
14825 }
14826 }
14827 return SDValue();
14828}
14829
14830SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14831 DAGCombinerInfo &DCI) const {
14832 SelectionDAG &DAG = DCI.DAG;
14833 SDLoc SL(N);
14834
14835 SDValue LHS = N->getOperand(0);
14836 SDValue RHS = N->getOperand(1);
14837 EVT VT = LHS.getValueType();
14838 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14839
14840 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14841 if (!CRHS) {
14842 CRHS = dyn_cast<ConstantSDNode>(LHS);
14843 if (CRHS) {
14844 std::swap(LHS, RHS);
14845 CC = getSetCCSwappedOperands(CC);
14846 }
14847 }
14848
14849 if (CRHS) {
14850 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14851 isBoolSGPR(LHS.getOperand(0))) {
14852 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14853 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14854 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14855 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14856 if ((CRHS->isAllOnes() &&
14857 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14858 (CRHS->isZero() &&
14859 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14860 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14861 DAG.getAllOnesConstant(SL, MVT::i1));
14862 if ((CRHS->isAllOnes() &&
14863 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14864 (CRHS->isZero() &&
14865 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14866 return LHS.getOperand(0);
14867 }
14868
14869 const APInt &CRHSVal = CRHS->getAPIntValue();
14870 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14871 LHS.getOpcode() == ISD::SELECT &&
14872 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14873 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14874 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14875 isBoolSGPR(LHS.getOperand(0))) {
14876 // Given CT != FT:
14877 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14878 // setcc (select cc, CT, CF), CF, ne => cc
14879 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14880 // setcc (select cc, CT, CF), CT, eq => cc
14881 const APInt &CT = LHS.getConstantOperandAPInt(1);
14882 const APInt &CF = LHS.getConstantOperandAPInt(2);
14883
14884 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14885 (CT == CRHSVal && CC == ISD::SETNE))
14886 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14887 DAG.getAllOnesConstant(SL, MVT::i1));
14888 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14889 (CT == CRHSVal && CC == ISD::SETEQ))
14890 return LHS.getOperand(0);
14891 }
14892 }
14893
14894 if (VT != MVT::f32 && VT != MVT::f64 &&
14895 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14896 return SDValue();
14897
14898 // Match isinf/isfinite pattern
14899 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14900 // (fcmp one (fabs x), inf) -> (fp_class x,
14901 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14902 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14903 LHS.getOpcode() == ISD::FABS) {
14904 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14905 if (!CRHS)
14906 return SDValue();
14907
14908 const APFloat &APF = CRHS->getValueAPF();
14909 if (APF.isInfinity() && !APF.isNegative()) {
14910 const unsigned IsInfMask =
14911 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14912 const unsigned IsFiniteMask =
14913 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14914 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14915 SIInstrFlags::P_SUBNORMAL;
14916 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14917 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14918 DAG.getConstant(Mask, SL, MVT::i32));
14919 }
14920 }
14921
14922 return SDValue();
14923}
14924
14925SDValue
14926SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14927 DAGCombinerInfo &DCI) const {
14928 SelectionDAG &DAG = DCI.DAG;
14929 SDLoc SL(N);
14930 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14931
14932 SDValue Src = N->getOperand(0);
14933 SDValue Shift = N->getOperand(0);
14934
14935 // TODO: Extend type shouldn't matter (assuming legal types).
14936 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14937 Shift = Shift.getOperand(0);
14938
14939 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14940 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14941 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14942 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14943 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14944 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14945 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14946 SDValue Shifted = DAG.getZExtOrTrunc(
14947 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14948
14949 unsigned ShiftOffset = 8 * Offset;
14950 if (Shift.getOpcode() == ISD::SHL)
14951 ShiftOffset -= C->getZExtValue();
14952 else
14953 ShiftOffset += C->getZExtValue();
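// [Editorial example, not in the original source] For cvt_f32_ubyte0 (srl x, 16)
// we have Offset = 0 and C = 16, so ShiftOffset = 16 and the node is rewritten
// to cvt_f32_ubyte2 x, matching the table above.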
14954
14955 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14956 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14957 MVT::f32, Shifted);
14958 }
14959 }
14960 }
14961
14962 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14963 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
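// [Editorial example, not in the original source] For cvt_f32_ubyte2, Offset = 2
// and DemandedBits covers bits [16, 24), i.e. 0x00FF0000.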
14964 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14965 // We simplified Src. If this node is not dead, visit it again so it is
14966 // folded properly.
14967 if (N->getOpcode() != ISD::DELETED_NODE)
14968 DCI.AddToWorklist(N);
14969 return SDValue(N, 0);
14970 }
14971
14972 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14973 if (SDValue DemandedSrc =
14974 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14975 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14976
14977 return SDValue();
14978}
14979
14980SDValue SITargetLowering::performClampCombine(SDNode *N,
14981 DAGCombinerInfo &DCI) const {
14982 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14983 if (!CSrc)
14984 return SDValue();
14985
14986 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14987 const APFloat &F = CSrc->getValueAPF();
14988 APFloat Zero = APFloat::getZero(F.getSemantics());
14989 if (F < Zero ||
14990 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14991 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14992 }
14993
14994 APFloat One(F.getSemantics(), "1.0");
14995 if (F > One)
14996 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14997
14998 return SDValue(CSrc, 0);
14999}
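// [Editorial example, not in the original source] performClampCombine above
// folds clamp(2.0) to 1.0, clamp(-0.5) to 0.0, and clamp(NaN) to 0.0 when
// DX10Clamp is enabled.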
15000
15001 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15002 DAGCombinerInfo &DCI) const {
15003 switch (N->getOpcode()) {
15004 case ISD::ADD:
15005 case ISD::SUB:
15006 case ISD::SHL:
15007 case ISD::SRL:
15008 case ISD::SRA:
15009 case ISD::AND:
15010 case ISD::OR:
15011 case ISD::XOR:
15012 case ISD::MUL:
15013 case ISD::SETCC:
15014 case ISD::SELECT:
15015 case ISD::SMIN:
15016 case ISD::SMAX:
15017 case ISD::UMIN:
15018 case ISD::UMAX:
15019 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15020 return Res;
15021 break;
15022 default:
15023 break;
15024 }
15025
15026 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15027 return SDValue();
15028
15029 switch (N->getOpcode()) {
15030 case ISD::ADD:
15031 return performAddCombine(N, DCI);
15032 case ISD::SUB:
15033 return performSubCombine(N, DCI);
15034 case ISD::UADDO_CARRY:
15035 case ISD::USUBO_CARRY:
15036 return performAddCarrySubCarryCombine(N, DCI);
15037 case ISD::FADD:
15038 return performFAddCombine(N, DCI);
15039 case ISD::FSUB:
15040 return performFSubCombine(N, DCI);
15041 case ISD::FDIV:
15042 return performFDivCombine(N, DCI);
15043 case ISD::FMUL:
15044 return performFMulCombine(N, DCI);
15045 case ISD::SETCC:
15046 return performSetCCCombine(N, DCI);
15047 case ISD::FMAXNUM:
15048 case ISD::FMINNUM:
15049 case ISD::FMAXNUM_IEEE:
15050 case ISD::FMINNUM_IEEE:
15051 case ISD::FMAXIMUM:
15052 case ISD::FMINIMUM:
15053 case ISD::SMAX:
15054 case ISD::SMIN:
15055 case ISD::UMAX:
15056 case ISD::UMIN:
15057 case AMDGPUISD::FMIN_LEGACY:
15058 case AMDGPUISD::FMAX_LEGACY:
15059 return performMinMaxCombine(N, DCI);
15060 case ISD::FMA:
15061 return performFMACombine(N, DCI);
15062 case ISD::AND:
15063 return performAndCombine(N, DCI);
15064 case ISD::OR:
15065 return performOrCombine(N, DCI);
15066 case ISD::FSHR: {
15067 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15068 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15069 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15070 return matchPERM(N, DCI);
15071 }
15072 break;
15073 }
15074 case ISD::XOR:
15075 return performXorCombine(N, DCI);
15076 case ISD::ZERO_EXTEND:
15077 return performZeroExtendCombine(N, DCI);
15078 case ISD::SIGN_EXTEND_INREG:
15079 return performSignExtendInRegCombine(N, DCI);
15080 case AMDGPUISD::FP_CLASS:
15081 return performClassCombine(N, DCI);
15082 case ISD::FCANONICALIZE:
15083 return performFCanonicalizeCombine(N, DCI);
15084 case AMDGPUISD::RCP:
15085 return performRcpCombine(N, DCI);
15086 case ISD::FLDEXP:
15087 case AMDGPUISD::FRACT:
15088 case AMDGPUISD::RSQ:
15089 case AMDGPUISD::RCP_LEGACY:
15090 case AMDGPUISD::RCP_IFLAG:
15091 case AMDGPUISD::RSQ_CLAMP: {
15092 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15093 SDValue Src = N->getOperand(0);
15094 if (Src.isUndef())
15095 return Src;
15096 break;
15097 }
15098 case ISD::SINT_TO_FP:
15099 case ISD::UINT_TO_FP:
15100 return performUCharToFloatCombine(N, DCI);
15101 case ISD::FCOPYSIGN:
15102 return performFCopySignCombine(N, DCI);
15103 case AMDGPUISD::CVT_F32_UBYTE0:
15104 case AMDGPUISD::CVT_F32_UBYTE1:
15105 case AMDGPUISD::CVT_F32_UBYTE2:
15106 case AMDGPUISD::CVT_F32_UBYTE3:
15107 return performCvtF32UByteNCombine(N, DCI);
15108 case AMDGPUISD::FMED3:
15109 return performFMed3Combine(N, DCI);
15110 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15111 return performCvtPkRTZCombine(N, DCI);
15112 case AMDGPUISD::CLAMP:
15113 return performClampCombine(N, DCI);
15114 case ISD::SCALAR_TO_VECTOR: {
15115 SelectionDAG &DAG = DCI.DAG;
15116 EVT VT = N->getValueType(0);
15117
15118 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15119 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15120 SDLoc SL(N);
15121 SDValue Src = N->getOperand(0);
15122 EVT EltVT = Src.getValueType();
15123 if (EltVT != MVT::i16)
15124 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15125
15126 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15127 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15128 }
15129
15130 break;
15131 }
15132 case ISD::EXTRACT_VECTOR_ELT:
15133 return performExtractVectorEltCombine(N, DCI);
15134 case ISD::INSERT_VECTOR_ELT:
15135 return performInsertVectorEltCombine(N, DCI);
15136 case ISD::FP_ROUND:
15137 return performFPRoundCombine(N, DCI);
15138 case ISD::LOAD: {
15139 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15140 return Widened;
15141 [[fallthrough]];
15142 }
15143 default: {
15144 if (!DCI.isBeforeLegalize()) {
15145 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15146 return performMemSDNodeCombine(MemNode, DCI);
15147 }
15148
15149 break;
15150 }
15151 }
15152
15153 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15154}
15155
15156/// Helper function for adjustWritemask
15157static unsigned SubIdx2Lane(unsigned Idx) {
15158 switch (Idx) {
15159 default:
15160 return ~0u;
15161 case AMDGPU::sub0:
15162 return 0;
15163 case AMDGPU::sub1:
15164 return 1;
15165 case AMDGPU::sub2:
15166 return 2;
15167 case AMDGPU::sub3:
15168 return 3;
15169 case AMDGPU::sub4:
15170 return 4; // Possible with TFE/LWE
15171 }
15172}
15173
15174/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15175SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15176 SelectionDAG &DAG) const {
15177 unsigned Opcode = Node->getMachineOpcode();
15178
15179 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15180 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15181 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15182 return Node; // not implemented for D16
15183
15184 SDNode *Users[5] = {nullptr};
15185 unsigned Lane = 0;
15186 unsigned DmaskIdx =
15187 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15188 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15189 unsigned NewDmask = 0;
15190 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15191 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15192 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15193 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15194 ? true
15195 : false;
15196 unsigned TFCLane = 0;
15197 bool HasChain = Node->getNumValues() > 1;
15198
15199 if (OldDmask == 0) {
15200 // These are folded out, but in case it happens, don't assert.
15201 return Node;
15202 }
15203
15204 unsigned OldBitsSet = llvm::popcount(OldDmask);
15205 // Work out which is the TFE/LWE lane if that is enabled.
15206 if (UsesTFC) {
15207 TFCLane = OldBitsSet;
15208 }
15209
15210 // Try to figure out the used register components
15211 for (SDUse &Use : Node->uses()) {
15212
15213 // Don't look at users of the chain.
15214 if (Use.getResNo() != 0)
15215 continue;
15216
15217 SDNode *User = Use.getUser();
15218
15219 // Abort if we can't understand the usage
15220 if (!User->isMachineOpcode() ||
15221 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15222 return Node;
15223
15224 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15225 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15226 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15227 // set, etc.
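// [Editorial example, not in the original source] With OldDmask = 0b1010
// (components Y and W enabled), Lane 0 corresponds to component Y and Lane 1
// to component W.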
15228 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15229 if (Lane == ~0u)
15230 return Node;
15231
15232 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15233 if (UsesTFC && Lane == TFCLane) {
15234 Users[Lane] = User;
15235 } else {
15236 // Set which texture component corresponds to the lane.
15237 unsigned Comp;
15238 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15239 Comp = llvm::countr_zero(Dmask);
15240 Dmask &= ~(1 << Comp);
15241 }
15242
15243 // Abort if we have more than one user per component.
15244 if (Users[Lane])
15245 return Node;
15246
15247 Users[Lane] = User;
15248 NewDmask |= 1 << Comp;
15249 }
15250 }
15251
15252 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15253 bool NoChannels = !NewDmask;
15254 if (NoChannels) {
15255 if (!UsesTFC) {
15256 // No uses of the result and not using TFC. Then do nothing.
15257 return Node;
15258 }
15259 // If the original dmask has one channel, there is nothing to do.
15260 if (OldBitsSet == 1)
15261 return Node;
15262 // Use an arbitrary dmask - required for the instruction to work
15263 NewDmask = 1;
15264 }
15265 // Abort if there's no change
15266 if (NewDmask == OldDmask)
15267 return Node;
15268
15269 unsigned BitsSet = llvm::popcount(NewDmask);
15270
15271 // Check for TFE or LWE - increase the number of channels by one to account
15272 // for the extra return value
15273 // This will need adjustment for D16 if this is also included in
15274 // adjustWriteMask (this function), but at present D16 is excluded.
15275 unsigned NewChannels = BitsSet + UsesTFC;
15276
15277 int NewOpcode =
15278 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15279 assert(NewOpcode != -1 &&
15280 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15281 "failed to find equivalent MIMG op");
15282
15283 // Adjust the writemask in the node
15284 SmallVector<SDValue, 12> Ops;
15285 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15286 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15287 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15288
15289 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15290
15291 MVT ResultVT = NewChannels == 1
15292 ? SVT
15293 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15294 : NewChannels == 5 ? 8
15295 : NewChannels);
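// [Editorial note, not in the original source] A 3- or 5-channel result is
// widened to the next available vector width (4 or 8 elements); the extra
// elements are simply left unused.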
15296 SDVTList NewVTList =
15297 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15298
15299 MachineSDNode *NewNode =
15300 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15301
15302 if (HasChain) {
15303 // Update chain.
15304 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15305 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15306 }
15307
15308 if (NewChannels == 1) {
15309 assert(Node->hasNUsesOfValue(1, 0));
15310 SDNode *Copy =
15311 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15312 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15313 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15314 return nullptr;
15315 }
15316
15317 // Update the users of the node with the new indices
15318 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15319 SDNode *User = Users[i];
15320 if (!User) {
15321 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15322 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15323 if (i || !NoChannels)
15324 continue;
15325 } else {
15326 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15327 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15328 if (NewUser != User) {
15329 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15330 DAG.RemoveDeadNode(User);
15331 }
15332 }
15333
15334 switch (Idx) {
15335 default:
15336 break;
15337 case AMDGPU::sub0:
15338 Idx = AMDGPU::sub1;
15339 break;
15340 case AMDGPU::sub1:
15341 Idx = AMDGPU::sub2;
15342 break;
15343 case AMDGPU::sub2:
15344 Idx = AMDGPU::sub3;
15345 break;
15346 case AMDGPU::sub3:
15347 Idx = AMDGPU::sub4;
15348 break;
15349 }
15350 }
15351
15352 DAG.RemoveDeadNode(Node);
15353 return nullptr;
15354}
15355
15356 static bool isFrameIndexOp(SDValue Op) {
15357 if (Op.getOpcode() == ISD::AssertZext)
15358 Op = Op.getOperand(0);
15359
15360 return isa<FrameIndexSDNode>(Op);
15361}
15362
15363/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15364/// with frame index operands.
15365 /// LLVM assumes that inputs to these instructions are registers.
15366SDNode *
15367 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15368 SelectionDAG &DAG) const {
15369 if (Node->getOpcode() == ISD::CopyToReg) {
15370 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15371 SDValue SrcVal = Node->getOperand(2);
15372
15373 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15374 // to try understanding copies to physical registers.
15375 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15376 SDLoc SL(Node);
15377 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15378 SDValue VReg = DAG.getRegister(
15379 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15380
15381 SDNode *Glued = Node->getGluedNode();
15382 SDValue ToVReg = DAG.getCopyToReg(
15383 Node->getOperand(0), SL, VReg, SrcVal,
15384 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15385 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15386 VReg, ToVReg.getValue(1));
15387 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15388 DAG.RemoveDeadNode(Node);
15389 return ToResultReg.getNode();
15390 }
15391 }
15392
15393 SmallVector<SDValue, 8> Ops;
15394 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15395 if (!isFrameIndexOp(Node->getOperand(i))) {
15396 Ops.push_back(Node->getOperand(i));
15397 continue;
15398 }
15399
15400 SDLoc DL(Node);
15401 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15402 Node->getOperand(i).getValueType(),
15403 Node->getOperand(i)),
15404 0));
15405 }
15406
15407 return DAG.UpdateNodeOperands(Node, Ops);
15408}
15409
15410/// Fold the instructions after selecting them.
15411/// Returns null if users were already updated.
15412 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15413 SelectionDAG &DAG) const {
15414 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15415 unsigned Opcode = Node->getMachineOpcode();
15416
15417 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15418 !TII->isGather4(Opcode) &&
15419 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15420 return adjustWritemask(Node, DAG);
15421 }
15422
15423 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15424 legalizeTargetIndependentNode(Node, DAG);
15425 return Node;
15426 }
15427
15428 switch (Opcode) {
15429 case AMDGPU::V_DIV_SCALE_F32_e64:
15430 case AMDGPU::V_DIV_SCALE_F64_e64: {
15431 // Satisfy the operand register constraint when one of the inputs is
15432 // undefined. Ordinarily each undef value will have its own implicit_def of
15433 // a vreg, so force these to use a single register.
15434 SDValue Src0 = Node->getOperand(1);
15435 SDValue Src1 = Node->getOperand(3);
15436 SDValue Src2 = Node->getOperand(5);
15437
15438 if ((Src0.isMachineOpcode() &&
15439 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15440 (Src0 == Src1 || Src0 == Src2))
15441 break;
15442
15443 MVT VT = Src0.getValueType().getSimpleVT();
15444 const TargetRegisterClass *RC =
15445 getRegClassFor(VT, Src0.getNode()->isDivergent());
15446
15447 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15448 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15449
15450 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15451 Src0, SDValue());
15452
15453 // src0 must be the same register as src1 or src2, even if the value is
15454 // undefined, so make sure we don't violate this constraint.
15455 if (Src0.isMachineOpcode() &&
15456 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15457 if (Src1.isMachineOpcode() &&
15458 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15459 Src0 = Src1;
15460 else if (Src2.isMachineOpcode() &&
15461 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15462 Src0 = Src2;
15463 else {
15464 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15465 Src0 = UndefReg;
15466 Src1 = UndefReg;
15467 }
15468 } else
15469 break;
15470
15471 SmallVector<SDValue, 9> Ops(Node->ops());
15472 Ops[1] = Src0;
15473 Ops[3] = Src1;
15474 Ops[5] = Src2;
15475 Ops.push_back(ImpDef.getValue(1));
15476 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15477 }
15478 default:
15479 break;
15480 }
15481
15482 return Node;
15483}
15484
15485// Any MIMG instructions that use tfe or lwe require an initialization of the
15486// result register that will be written in the case of a memory access failure.
15487// The required code is also added to tie this init code to the result of the
15488// img instruction.
15489 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15490 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15491 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15492 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15493 MachineBasicBlock &MBB = *MI.getParent();
15494
15495 int DstIdx =
15496 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15497 unsigned InitIdx = 0;
15498
15499 if (TII->isImage(MI)) {
15500 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15501 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15502 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15503
15504 if (!TFE && !LWE) // intersect_ray
15505 return;
15506
15507 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15508 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15509 unsigned D16Val = D16 ? D16->getImm() : 0;
15510
15511 if (!TFEVal && !LWEVal)
15512 return;
15513
15514 // At least one of TFE or LWE is non-zero.
15515 // We have to insert a suitable initialization of the result value and
15516 // tie this to the dest of the image instruction.
15517
15518 // Calculate which dword we have to initialize to 0.
15519 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15520
15521 // Check that the dmask operand is found.
15522 assert(MO_Dmask && "Expected dmask operand in instruction");
15523
15524 unsigned dmask = MO_Dmask->getImm();
15525 // Determine the number of active lanes taking into account the
15526 // Gather4 special case
15527 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15528
15529 bool Packed = !Subtarget->hasUnpackedD16VMem();
15530
15531 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
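// [Editorial example, not in the original source] With dmask = 0b0111
// (ActiveLanes = 3), InitIdx is 4 dwords (3 data + 1 TFE/LWE), or 3 dwords
// when the data is packed D16 (two halves per dword).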
15532
15533 // Abandon attempt if the dst size isn't large enough
15534 // - this is in fact an error but this is picked up elsewhere and
15535 // reported correctly.
15536 uint32_t DstSize =
15537 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15538 if (DstSize < InitIdx)
15539 return;
15540 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15541 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15542 } else {
15543 return;
15544 }
15545
15546 const DebugLoc &DL = MI.getDebugLoc();
15547
15548 // Create a register for the initialization value.
15549 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15550 unsigned NewDst = 0; // Final initialized value will be in here
15551
15552 // If PRTStrictNull feature is enabled (the default) then initialize
15553 // all the result registers to 0, otherwise just the error indication
15554 // register (VGPRn+1)
15555 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15556 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15557
15558 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15559 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15560 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15561 // Initialize dword
15562 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15563 // clang-format off
15564 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15565 .addImm(0);
15566 // clang-format on
15567 // Insert into the super-reg
15568 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15569 .addReg(PrevDst)
15570 .addReg(SubReg)
15571 .addImm(AMDGPU::sub0 + CurrIdx);
15572
15573 PrevDst = NewDst;
15574 }
15575
15576 // Add as an implicit operand
15577 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15578
15579 // Tie the just added implicit operand to the dst
15580 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15581}
15582
15583/// Assign the register class depending on the number of
15584/// bits set in the writemask
15585 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15586 SDNode *Node) const {
15587 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15588
15589 MachineFunction *MF = MI.getParent()->getParent();
15590 MachineRegisterInfo &MRI = MF->getRegInfo();
15591 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15592
15593 if (TII->isVOP3(MI.getOpcode())) {
15594 // Make sure constant bus requirements are respected.
15595 TII->legalizeOperandsVOP3(MRI, MI);
15596
15597 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15598 // This saves a chain-copy of registers and better balances register
15599 // use between vgpr and agpr, as agpr tuples tend to be big.
15600 if (!MI.getDesc().operands().empty()) {
15601 unsigned Opc = MI.getOpcode();
15602 bool HasAGPRs = Info->mayNeedAGPRs();
15603 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15604 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15605 for (auto I :
15606 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15607 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15608 if (I == -1)
15609 break;
15610 if ((I == Src2Idx) && (HasAGPRs))
15611 break;
15612 MachineOperand &Op = MI.getOperand(I);
15613 if (!Op.isReg() || !Op.getReg().isVirtual())
15614 continue;
15615 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15616 if (!TRI->hasAGPRs(RC))
15617 continue;
15618 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15619 if (!Src || !Src->isCopy() ||
15620 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15621 continue;
15622 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15623 // All uses of agpr64 and agpr32 can also accept vgpr except for
15624 // v_accvgpr_read, but we do not produce agpr reads during selection,
15625 // so no use checks are needed.
15626 MRI.setRegClass(Op.getReg(), NewRC);
15627 }
15628
15629 if (TII->isMAI(MI)) {
15630 // The ordinary src0, src1, src2 were legalized above.
15631 //
15632 // We also have to legalize the appended v_mfma_ld_scale_b32 operands,
15633 // as a separate instruction.
15634 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15635 AMDGPU::OpName::scale_src0);
15636 if (Src0Idx != -1) {
15637 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15638 AMDGPU::OpName::scale_src1);
15639 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15640 TII->usesConstantBus(MRI, MI, Src1Idx))
15641 TII->legalizeOpWithMove(MI, Src1Idx);
15642 }
15643 }
15644
15645 if (!HasAGPRs)
15646 return;
15647
15648 // Resolve the rest of AV operands to AGPRs.
15649 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15650 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15651 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15652 if (TRI->isVectorSuperClass(RC)) {
15653 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15654 MRI.setRegClass(Src2->getReg(), NewRC);
15655 if (Src2->isTied())
15656 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15657 }
15658 }
15659 }
15660 }
15661
15662 return;
15663 }
15664
15665 if (TII->isImage(MI))
15666 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15667}
15668
15669 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15670 uint64_t Val) {
15671 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15672 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15673}
15674
15675 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15676 const SDLoc &DL,
15677 SDValue Ptr) const {
15678 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15679
15680 // Build the half of the subregister with the constants before building the
15681 // full 128-bit register. If we are building multiple resource descriptors,
15682 // this will allow CSEing of the 2-component register.
15683 const SDValue Ops0[] = {
15684 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15685 buildSMovImm32(DAG, DL, 0),
15686 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15687 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15688 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15689
15690 SDValue SubRegHi = SDValue(
15691 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15692
15693 // Combine the constants and the pointer.
15694 const SDValue Ops1[] = {
15695 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15696 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15697 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15698
15699 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15700}
15701
15702/// Return a resource descriptor with the 'Add TID' bit enabled
15703/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15704/// of the resource descriptor) to create an offset, which is added to
15705/// the resource pointer.
15706 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15707 SDValue Ptr, uint32_t RsrcDword1,
15708 uint64_t RsrcDword2And3) const {
15709 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15710 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15711 if (RsrcDword1) {
15712 PtrHi =
15713 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15714 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15715 0);
15716 }
15717
15718 SDValue DataLo =
15719 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15720 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15721
15722 const SDValue Ops[] = {
15723 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15724 PtrLo,
15725 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15726 PtrHi,
15727 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15728 DataLo,
15729 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15730 DataHi,
15731 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15732
15733 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15734}
15735
15736//===----------------------------------------------------------------------===//
15737// SI Inline Assembly Support
15738//===----------------------------------------------------------------------===//
15739
15740std::pair<unsigned, const TargetRegisterClass *>
15741 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15742 StringRef Constraint,
15743 MVT VT) const {
15744 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15745
15746 const TargetRegisterClass *RC = nullptr;
15747 if (Constraint.size() == 1) {
15748 const unsigned BitWidth = VT.getSizeInBits();
15749 switch (Constraint[0]) {
15750 default:
15751 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15752 case 's':
15753 case 'r':
15754 switch (BitWidth) {
15755 case 16:
15756 RC = &AMDGPU::SReg_32RegClass;
15757 break;
15758 case 64:
15759 RC = &AMDGPU::SGPR_64RegClass;
15760 break;
15761 default:
15762 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15763 if (!RC)
15764 return std::pair(0U, nullptr);
15765 break;
15766 }
15767 break;
15768 case 'v':
15769 switch (BitWidth) {
15770 case 16:
15771 RC = &AMDGPU::VGPR_32RegClass;
15772 break;
15773 default:
15774 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15775 if (!RC)
15776 return std::pair(0U, nullptr);
15777 break;
15778 }
15779 break;
15780 case 'a':
15781 if (!Subtarget->hasMAIInsts())
15782 break;
15783 switch (BitWidth) {
15784 case 16:
15785 RC = &AMDGPU::AGPR_32RegClass;
15786 break;
15787 default:
15788 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15789 if (!RC)
15790 return std::pair(0U, nullptr);
15791 break;
15792 }
15793 break;
15794 }
15795 // We actually support i128, i16 and f16 as inline parameters
15796 // even if they are not reported as legal
15797 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15798 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15799 return std::pair(0U, RC);
15800 }
15801
15802 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15803 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15804 if (RegName.consume_front("v")) {
15805 RC = &AMDGPU::VGPR_32RegClass;
15806 } else if (RegName.consume_front("s")) {
15807 RC = &AMDGPU::SGPR_32RegClass;
15808 } else if (RegName.consume_front("a")) {
15809 RC = &AMDGPU::AGPR_32RegClass;
15810 }
15811
15812 if (RC) {
15813 uint32_t Idx;
15814 if (RegName.consume_front("[")) {
15815 uint32_t End;
15816 bool Failed = RegName.consumeInteger(10, Idx);
15817 Failed |= !RegName.consume_front(":");
15818 Failed |= RegName.consumeInteger(10, End);
15819 Failed |= !RegName.consume_back("]");
15820 if (!Failed) {
15821 uint32_t Width = (End - Idx + 1) * 32;
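// [Editorial example, not in the original source] A constraint such as
// "{v[8:11]}" yields Idx = 8, End = 11 and Width = 128 bits, so the matching
// 128-bit VGPR super-register starting at v8 is returned.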
15822 // Prohibit constraints for register ranges with a width that does not
15823 // match the required type.
15824 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15825 return std::pair(0U, nullptr);
15826 MCRegister Reg = RC->getRegister(Idx);
15827 if (SIRegisterInfo::isVGPRClass(RC))
15828 RC = TRI->getVGPRClassForBitWidth(Width);
15829 else if (SIRegisterInfo::isSGPRClass(RC))
15830 RC = TRI->getSGPRClassForBitWidth(Width);
15831 else if (SIRegisterInfo::isAGPRClass(RC))
15832 RC = TRI->getAGPRClassForBitWidth(Width);
15833 if (RC) {
15834 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15835 return std::pair(Reg, RC);
15836 }
15837 }
15838 } else {
15839 // Check for lossy scalar/vector conversions.
15840 if (VT.isVector() && VT.getSizeInBits() != 32)
15841 return std::pair(0U, nullptr);
15842 bool Failed = RegName.getAsInteger(10, Idx);
15843 if (!Failed && Idx < RC->getNumRegs())
15844 return std::pair(RC->getRegister(Idx), RC);
15845 }
15846 }
15847 }
15848
15849 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15850 if (Ret.first)
15851 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15852
15853 return Ret;
15854}
15855
15856static bool isImmConstraint(StringRef Constraint) {
15857 if (Constraint.size() == 1) {
15858 switch (Constraint[0]) {
15859 default:
15860 break;
15861 case 'I':
15862 case 'J':
15863 case 'A':
15864 case 'B':
15865 case 'C':
15866 return true;
15867 }
15868 } else if (Constraint == "DA" || Constraint == "DB") {
15869 return true;
15870 }
15871 return false;
15872}
15873
15874 SITargetLowering::ConstraintType
15875 SITargetLowering::getConstraintType(StringRef Constraint) const {
15876 if (Constraint.size() == 1) {
15877 switch (Constraint[0]) {
15878 default:
15879 break;
15880 case 's':
15881 case 'v':
15882 case 'a':
15883 return C_RegisterClass;
15884 }
15885 }
15886 if (isImmConstraint(Constraint)) {
15887 return C_Other;
15888 }
15889 return TargetLowering::getConstraintType(Constraint);
15890}
15891
15892static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15893 if (!AMDGPU::isInlinableIntLiteral(Val)) {
15894 Val = Val & maskTrailingOnes<uint64_t>(Size);
15895 }
15896 return Val;
15897}
15898
15899 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15900 StringRef Constraint,
15901 std::vector<SDValue> &Ops,
15902 SelectionDAG &DAG) const {
15903 if (isImmConstraint(Constraint)) {
15904 uint64_t Val;
15905 if (getAsmOperandConstVal(Op, Val) &&
15906 checkAsmConstraintVal(Op, Constraint, Val)) {
15907 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15908 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15909 }
15910 } else {
15911 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15912 }
15913}
15914
15915 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15916 unsigned Size = Op.getScalarValueSizeInBits();
15917 if (Size > 64)
15918 return false;
15919
15920 if (Size == 16 && !Subtarget->has16BitInsts())
15921 return false;
15922
15923 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15924 Val = C->getSExtValue();
15925 return true;
15926 }
15927 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15928 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15929 return true;
15930 }
15931 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15932 if (Size != 16 || Op.getNumOperands() != 2)
15933 return false;
15934 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15935 return false;
15936 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15937 Val = C->getSExtValue();
15938 return true;
15939 }
15940 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15941 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15942 return true;
15943 }
15944 }
15945
15946 return false;
15947}
15948
15949 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15950 uint64_t Val) const {
15951 if (Constraint.size() == 1) {
15952 switch (Constraint[0]) {
15953 case 'I':
15954 return AMDGPU::isInlinableIntLiteral(Val);
15955 case 'J':
15956 return isInt<16>(Val);
15957 case 'A':
15958 return checkAsmConstraintValA(Op, Val);
15959 case 'B':
15960 return isInt<32>(Val);
15961 case 'C':
15962 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15963 AMDGPU::isInlinableIntLiteral(Val);
15964 default:
15965 break;
15966 }
15967 } else if (Constraint.size() == 2) {
15968 if (Constraint == "DA") {
15969 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15970 int64_t LoBits = static_cast<int32_t>(Val);
15971 return checkAsmConstraintValA(Op, HiBits, 32) &&
15972 checkAsmConstraintValA(Op, LoBits, 32);
15973 }
15974 if (Constraint == "DB") {
15975 return true;
15976 }
15977 }
15978 llvm_unreachable("Invalid asm constraint");
15979}
15980
15981 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15982 unsigned MaxSize) const {
15983 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15984 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15985 if (Size == 16) {
15986 MVT VT = Op.getSimpleValueType();
15987 switch (VT.SimpleTy) {
15988 default:
15989 return false;
15990 case MVT::i16:
15991 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15992 case MVT::f16:
15993 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15994 case MVT::bf16:
15995 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15996 case MVT::v2i16:
15997 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15998 case MVT::v2f16:
15999 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16000 case MVT::v2bf16:
16001 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16002 }
16003 }
16004 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16005 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16006 return true;
16007 return false;
16008}
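// [Editorial example, not in the original source] For a 32-bit operand the 'A'
// constraint accepts inline constants such as 0x3F800000 (1.0f) or small
// integers in [-16, 64], but rejects arbitrary literals like 0x12345678.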
16009
16010static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16011 switch (UnalignedClassID) {
16012 case AMDGPU::VReg_64RegClassID:
16013 return AMDGPU::VReg_64_Align2RegClassID;
16014 case AMDGPU::VReg_96RegClassID:
16015 return AMDGPU::VReg_96_Align2RegClassID;
16016 case AMDGPU::VReg_128RegClassID:
16017 return AMDGPU::VReg_128_Align2RegClassID;
16018 case AMDGPU::VReg_160RegClassID:
16019 return AMDGPU::VReg_160_Align2RegClassID;
16020 case AMDGPU::VReg_192RegClassID:
16021 return AMDGPU::VReg_192_Align2RegClassID;
16022 case AMDGPU::VReg_224RegClassID:
16023 return AMDGPU::VReg_224_Align2RegClassID;
16024 case AMDGPU::VReg_256RegClassID:
16025 return AMDGPU::VReg_256_Align2RegClassID;
16026 case AMDGPU::VReg_288RegClassID:
16027 return AMDGPU::VReg_288_Align2RegClassID;
16028 case AMDGPU::VReg_320RegClassID:
16029 return AMDGPU::VReg_320_Align2RegClassID;
16030 case AMDGPU::VReg_352RegClassID:
16031 return AMDGPU::VReg_352_Align2RegClassID;
16032 case AMDGPU::VReg_384RegClassID:
16033 return AMDGPU::VReg_384_Align2RegClassID;
16034 case AMDGPU::VReg_512RegClassID:
16035 return AMDGPU::VReg_512_Align2RegClassID;
16036 case AMDGPU::VReg_1024RegClassID:
16037 return AMDGPU::VReg_1024_Align2RegClassID;
16038 case AMDGPU::AReg_64RegClassID:
16039 return AMDGPU::AReg_64_Align2RegClassID;
16040 case AMDGPU::AReg_96RegClassID:
16041 return AMDGPU::AReg_96_Align2RegClassID;
16042 case AMDGPU::AReg_128RegClassID:
16043 return AMDGPU::AReg_128_Align2RegClassID;
16044 case AMDGPU::AReg_160RegClassID:
16045 return AMDGPU::AReg_160_Align2RegClassID;
16046 case AMDGPU::AReg_192RegClassID:
16047 return AMDGPU::AReg_192_Align2RegClassID;
16048 case AMDGPU::AReg_256RegClassID:
16049 return AMDGPU::AReg_256_Align2RegClassID;
16050 case AMDGPU::AReg_512RegClassID:
16051 return AMDGPU::AReg_512_Align2RegClassID;
16052 case AMDGPU::AReg_1024RegClassID:
16053 return AMDGPU::AReg_1024_Align2RegClassID;
16054 default:
16055 return -1;
16056 }
16057}
16058
16059// Figure out which registers should be reserved for stack access. Only after
16060// the function is legalized do we know all of the non-spill stack objects or if
16061// calls are present.
16062 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16063 MachineRegisterInfo &MRI = MF.getRegInfo();
16064 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16065 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16066 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16067 const SIInstrInfo *TII = ST.getInstrInfo();
16068
16069 if (Info->isEntryFunction()) {
16070 // Callable functions have fixed registers used for stack access.
16071 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16072 }
16073
16074 // TODO: Move this logic to getReservedRegs()
16075 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16076 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16077 Register SReg = ST.isWave32()
16078 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16079 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16080 &AMDGPU::SGPR_64RegClass);
16081 Info->setSGPRForEXECCopy(SReg);
16082
16083 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16084 Info->getStackPtrOffsetReg()));
16085 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16086 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16087
16088 // We need to worry about replacing the default register with itself in case
16089 // of MIR testcases missing the MFI.
16090 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16091 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16092
16093 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16094 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16095
16096 Info->limitOccupancy(MF);
16097
16098 if (ST.isWave32() && !MF.empty()) {
16099 for (auto &MBB : MF) {
16100 for (auto &MI : MBB) {
16101 TII->fixImplicitOperands(MI);
16102 }
16103 }
16104 }
16105
16106 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16107 // classes if required. Ideally the register class constraints would differ
16108 // per-subtarget, but there's no easy way to achieve that right now. This is
16109 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16110 // from using them as the register class for legal types.
16111 if (ST.needsAlignedVGPRs()) {
16112 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16113 const Register Reg = Register::index2VirtReg(I);
16114 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16115 if (!RC)
16116 continue;
16117 int NewClassID = getAlignedAGPRClassID(RC->getID());
16118 if (NewClassID != -1)
16119 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16120 }
16121 }
16122
16123 TargetLoweringBase::finalizeLowering(MF);
16124}
16125
16126 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16127 KnownBits &Known,
16128 const APInt &DemandedElts,
16129 const SelectionDAG &DAG,
16130 unsigned Depth) const {
16131 Known.resetAll();
16132 unsigned Opc = Op.getOpcode();
16133 switch (Opc) {
16134 case ISD::INTRINSIC_WO_CHAIN: {
16135 unsigned IID = Op.getConstantOperandVal(0);
16136 switch (IID) {
16137 case Intrinsic::amdgcn_mbcnt_lo:
16138 case Intrinsic::amdgcn_mbcnt_hi: {
16139 const GCNSubtarget &ST =
16140 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16141 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16142 // most 31 + src1.
16143 Known.Zero.setBitsFrom(
16144 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16145 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16146 Known = KnownBits::add(Known, Known2);
16147 return;
16148 }
16149 }
16150 break;
16151 }
16152 }
16153 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16154 Op, Known, DemandedElts, DAG, Depth);
16155}
16156
16157 void SITargetLowering::computeKnownBitsForFrameIndex(
16158 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16159 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16160
16161 // Set the high bits to zero based on the maximum allowed scratch size per
16162 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16163 // calculation won't overflow, so assume the sign bit is never set.
16164 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16165}
16166
16167 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16168 KnownBits &Known, unsigned Dim) {
16169 unsigned MaxValue =
16170 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16171 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16172}
16173
16174 void SITargetLowering::computeKnownBitsForTargetInstr(
16175 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16176 const MachineRegisterInfo &MRI, unsigned Depth) const {
16177 const MachineInstr *MI = MRI.getVRegDef(R);
16178 switch (MI->getOpcode()) {
16179 case AMDGPU::G_INTRINSIC:
16180 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16181 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16182 switch (IID) {
16183 case Intrinsic::amdgcn_workitem_id_x:
16184 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16185 break;
16186 case Intrinsic::amdgcn_workitem_id_y:
16187 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16188 break;
16189 case Intrinsic::amdgcn_workitem_id_z:
16190 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16191 break;
16192 case Intrinsic::amdgcn_mbcnt_lo:
16193 case Intrinsic::amdgcn_mbcnt_hi: {
16194 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16195 // most 31 + src1.
16196 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16197 ? getSubtarget()->getWavefrontSizeLog2()
16198 : 5);
16199 KnownBits Known2;
16200 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16201 Depth + 1);
16202 Known = KnownBits::add(Known, Known2);
16203 break;
16204 }
16205 case Intrinsic::amdgcn_groupstaticsize: {
16206 // We can report everything over the maximum size as 0. We can't report
16207 // based on the actual size because we don't know if it's accurate or not
16208 // at any given point.
16209 Known.Zero.setHighBits(
16210 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16211 break;
16212 }
16213 }
16214 break;
16215 }
16216 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16217 Known.Zero.setHighBits(24);
16218 break;
16219 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16220 Known.Zero.setHighBits(16);
16221 break;
16222 case AMDGPU::G_AMDGPU_SMED3:
16223 case AMDGPU::G_AMDGPU_UMED3: {
16224 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16225
16226 KnownBits Known2;
16227 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16228 if (Known2.isUnknown())
16229 break;
16230
16231 KnownBits Known1;
16232 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16233 if (Known1.isUnknown())
16234 break;
16235
16236 KnownBits Known0;
16237 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16238 if (Known0.isUnknown())
16239 break;
16240
16241 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16242 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16243 Known.One = Known0.One & Known1.One & Known2.One;
16244 break;
16245 }
16246 }
16247}
16248
16249 Align SITargetLowering::computeKnownAlignForTargetInstr(
16250 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16251 unsigned Depth) const {
16252 const MachineInstr *MI = MRI.getVRegDef(R);
16253 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16254 // FIXME: Can this move to generic code? What about the case where the call
16255 // site specifies a lower alignment?
16256 Intrinsic::ID IID = GI->getIntrinsicID();
16257 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16258 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16259 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16260 return *RetAlign;
16261 }
16262 return Align(1);
16263}
16264
16265 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16266 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16267 const Align CacheLineAlign = Align(64);
16268
16269 // Pre-GFX10 targets do not benefit from loop alignment.
16270 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16271 getSubtarget()->hasInstFwdPrefetchBug())
16272 return PrefAlign;
16273
16274 // On GFX10 I$ is 4 x 64 bytes cache lines.
16275 // By default prefetcher keeps one cache line behind and reads two ahead.
16276 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16277 // behind and one ahead.
16278 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
16279 // If the loop fits in 64 bytes it always spans no more than two cache lines and
16280 // does not need alignment.
16281 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
16282 // if it is at most 192 bytes we need two lines behind.
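// [Editorial summary, not in the original source] Loops of at most 64 bytes
// keep the default alignment; loops of 65-128 bytes get 64-byte alignment with
// the default prefetch settings; loops of 129-192 bytes additionally get
// S_INST_PREFETCH bracketing so two cache lines are kept behind the PC.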
16283
16284 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16285 const MachineBasicBlock *Header = ML->getHeader();
16286 if (Header->getAlignment() != PrefAlign)
16287 return Header->getAlignment(); // Already processed.
16288
16289 unsigned LoopSize = 0;
16290 for (const MachineBasicBlock *MBB : ML->blocks()) {
16291 // If an inner loop block is aligned, assume on average half of the alignment
16292 // size is added as nops.
16293 if (MBB != Header)
16294 LoopSize += MBB->getAlignment().value() / 2;
16295
16296 for (const MachineInstr &MI : *MBB) {
16297 LoopSize += TII->getInstSizeInBytes(MI);
16298 if (LoopSize > 192)
16299 return PrefAlign;
16300 }
16301 }
16302
16303 if (LoopSize <= 64)
16304 return PrefAlign;
16305
16306 if (LoopSize <= 128)
16307 return CacheLineAlign;
16308
16309 // If any of the parent loops is surrounded by prefetch instructions, do not
16310 // insert new ones for the inner loop, as that would reset the parent's settings.
16311 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16312 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16313 auto I = Exit->getFirstNonDebugInstr();
16314 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16315 return CacheLineAlign;
16316 }
16317 }
16318
16319 MachineBasicBlock *Pre = ML->getLoopPreheader();
16320 MachineBasicBlock *Exit = ML->getExitBlock();
16321
16322 if (Pre && Exit) {
16323 auto PreTerm = Pre->getFirstTerminator();
16324 if (PreTerm == Pre->begin() ||
16325 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16326 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16327 .addImm(1); // prefetch 2 lines behind PC
16328
16329 auto ExitHead = Exit->getFirstNonDebugInstr();
16330 if (ExitHead == Exit->end() ||
16331 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16332 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16333 .addImm(2); // prefetch 1 line behind PC
16334 }
16335
16336 return CacheLineAlign;
16337}
16338
16340static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16341 assert(N->getOpcode() == ISD::CopyFromReg);
16342 do {
16343 // Follow the chain until we find an INLINEASM node.
16344 N = N->getOperand(0).getNode();
16345 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16346 return true;
16347 } while (N->getOpcode() == ISD::CopyFromReg);
16348 return false;
16349}
16350
16353 UniformityInfo *UA) const {
16354 switch (N->getOpcode()) {
16355 case ISD::CopyFromReg: {
16356 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16357 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16358 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16359 Register Reg = R->getReg();
16360
16361 // FIXME: Why does this need to consider isLiveIn?
16362 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16363 return !TRI->isSGPRReg(MRI, Reg);
16364
16365 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16366 return UA->isDivergent(V);
16367
16368 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16369 return !TRI->isSGPRReg(MRI, Reg);
16370 }
16371 case ISD::LOAD: {
16372 const LoadSDNode *L = cast<LoadSDNode>(N);
16373 unsigned AS = L->getAddressSpace();
16374 // A flat load may access private memory.
16375 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16376 }
16377 case ISD::CALLSEQ_END:
16378 return true;
16379 case ISD::INTRINSIC_WO_CHAIN:
16380 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16381 case ISD::INTRINSIC_W_CHAIN:
16382 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16401 // Target-specific read-modify-write atomics are sources of divergence.
16402 return true;
16403 default:
16404 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16405 // Generic read-modify-write atomics are sources of divergence.
16406 return A->readMem() && A->writeMem();
16407 }
16408 return false;
16409 }
16410}
16411
16412 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16413 EVT VT) const {
16414 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16415 case MVT::f32:
16416 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16417 case MVT::f64:
16418 case MVT::f16:
16419 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16420 default:
16421 return false;
16422 }
16423}
16424
16425 bool SITargetLowering::denormalsEnabledForType(
16426 LLT Ty, const MachineFunction &MF) const {
16427 switch (Ty.getScalarSizeInBits()) {
16428 case 32:
16429 return !denormalModeIsFlushAllF32(MF);
16430 case 64:
16431 case 16:
16432 return !denormalModeIsFlushAllF64F16(MF);
16433 default:
16434 return false;
16435 }
16436}
16437
16438 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16439 const SelectionDAG &DAG,
16440 bool SNaN,
16441 unsigned Depth) const {
16442 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16443 const MachineFunction &MF = DAG.getMachineFunction();
16444 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16445
16446 if (Info->getMode().DX10Clamp)
16447 return true; // Clamped to 0.
16448 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16449 }
16450
16451 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16452 Depth);
16453}
16454
16455 // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
16456 // they do not support FP32 denormals and only support v2f16/f64 denormals.
16457 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16458 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16459 return true;
16460
16461 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16462 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16463 if (DenormMode == DenormalMode::getPreserveSign())
16464 return true;
16465
16466 // TODO: Remove this.
16467 return RMW->getFunction()
16468 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16469 .getValueAsBool();
16470}
16471
16472 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16473 LLVMContext &Ctx = RMW->getContext();
16474 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16475 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16476
16477 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16478 << "Hardware instruction generated for atomic "
16479 << RMW->getOperationName(RMW->getOperation())
16480 << " operation at memory scope " << MemScope;
16481}
16482
16483static bool isV2F16OrV2BF16(Type *Ty) {
16484 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16485 Type *EltTy = VT->getElementType();
16486 return VT->getNumElements() == 2 &&
16487 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16488 }
16489
16490 return false;
16491}
16492
16493static bool isV2F16(Type *Ty) {
16494 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16495 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16496}
16497
16498static bool isV2BF16(Type *Ty) {
16499 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16500 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16501}
16502
16503/// \return true if atomicrmw integer ops work for the type.
16504static bool isAtomicRMWLegalIntTy(Type *Ty) {
16505 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16506 unsigned BW = IT->getBitWidth();
16507 return BW == 32 || BW == 64;
16508 }
16509
16510 return false;
16511}
16512
16513/// \return true if this atomicrmw xchg type can be selected.
16514static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16515 Type *Ty = RMW->getType();
16516 if (isAtomicRMWLegalIntTy(Ty))
16517 return true;
16518
16519 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16520 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16521 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16522 return BW == 32 || BW == 64;
16523 }
16524
16525 if (Ty->isFloatTy() || Ty->isDoubleTy())
16526 return true;
16527
16528 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16529 return VT->getNumElements() == 2 &&
16530 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16531 }
16532
16533 return false;
16534}
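// [Editorial example, not in the original source] Under these rules an
// atomicrmw xchg on i32/i64, on a pointer whose address space uses 32- or
// 64-bit pointers, on float/double, or on a 2 x 16-bit vector is considered
// directly selectable.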
16535
16536/// \returns true if it's valid to emit a native instruction for \p RMW, based
16537/// on the properties of the target memory.
16538static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16539 const AtomicRMWInst *RMW,
16540 bool HasSystemScope) {
16541 // The remote/fine-grained access logic is different from the integer
16542 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16543 // fine-grained access does not work, even for a device local allocation.
16544 //
16545 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16546 // allocations work.
16547 if (HasSystemScope) {
16548 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16549 RMW->hasMetadata("amdgpu.no.remote.memory"))
16550 return true;
16551 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16552 return true;
16553
16554 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16555}
16556
16557/// \return Action to perform on AtomicRMWInsts for integer operations.
16558 static TargetLowering::AtomicExpansionKind
16559 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16560 return isAtomicRMWLegalIntTy(RMW->getType())
16561 ? TargetLowering::AtomicExpansionKind::None
16562 : TargetLowering::AtomicExpansionKind::CmpXChg;
16563}
16564
16565/// Return if a flat address space atomicrmw can access private memory.
16566 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16567 const MDNode *NoaliasAddrSpaceMD =
16568 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16569 if (!NoaliasAddrSpaceMD)
16570 return true;
16571
16572 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16573 ++I) {
16574 auto *Low = mdconst::extract<ConstantInt>(
16575 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16576 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16577 auto *High = mdconst::extract<ConstantInt>(
16578 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16579 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16580 }
16581 }
16582
16583 return true;
16584}
16585
16586 TargetLowering::AtomicExpansionKind
16587 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16588 unsigned AS = RMW->getPointerAddressSpace();
16589 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16590 return AtomicExpansionKind::NotAtomic;
16591
16592 // 64-bit flat atomics that dynamically reside in private memory will silently
16593 // be dropped.
16594 //
16595 // Note that we will emit a new copy of the original atomic in the expansion,
16596 // which will be incrementally relegalized.
16597 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16598 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16599 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16600 flatInstrMayAccessPrivate(RMW))
16601 return AtomicExpansionKind::Expand;
16602
16603 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16604 OptimizationRemarkEmitter ORE(RMW->getFunction());
16605 ORE.emit([=]() {
16606 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16607 });
16608 return Kind;
16609 };
16610
16611 auto SSID = RMW->getSyncScopeID();
16612 bool HasSystemScope =
16613 SSID == SyncScope::System ||
16614 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16615
16616 auto Op = RMW->getOperation();
16617 switch (Op) {
16618 case AtomicRMWInst::Xchg: {
16619 // PCIe supports add and xchg for system atomics.
16620 return isAtomicRMWLegalXChgTy(RMW)
16621 ? TargetLowering::AtomicExpansionKind::None
16622 : TargetLowering::AtomicExpansionKind::CmpXChg;
16623 }
16624 case AtomicRMWInst::Add:
16625 case AtomicRMWInst::And:
16626 case AtomicRMWInst::UIncWrap:
16627 case AtomicRMWInst::UDecWrap:
16628 return atomicSupportedIfLegalIntType(RMW);
16629 case AtomicRMWInst::Sub:
16630 case AtomicRMWInst::Or:
16631 case AtomicRMWInst::Xor: {
16632 // Atomic sub/or/xor do not work over PCI express, but atomic add
16633 // does. InstCombine transforms these with 0 to or, so undo that.
16634 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16635 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16636 ConstVal && ConstVal->isNullValue())
16637 return AtomicExpansionKind::Expand;
16638 }
16639
16640 return atomicSupportedIfLegalIntType(RMW);
16641 }
16642 case AtomicRMWInst::FAdd: {
16643 Type *Ty = RMW->getType();
16644
16645 // TODO: Handle REGION_ADDRESS
16646 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16647 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16648 // is fixed to round-to-nearest-even.
16649 //
16650 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16651 // round-to-nearest-even.
16652 //
16653 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16654 // suggests it is OK if the floating-point mode may not match the calling
16655 // thread.
16656 if (Ty->isFloatTy()) {
16657 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16658 : AtomicExpansionKind::CmpXChg;
16659 }
16660
16661 if (Ty->isDoubleTy()) {
16662 // Ignores denormal mode, but we don't consider flushing mandatory.
16663 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16664 : AtomicExpansionKind::CmpXChg;
16665 }
16666
16667 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16668 return AtomicExpansionKind::None;
16669
16670 return AtomicExpansionKind::CmpXChg;
16671 }
16672
16673 // LDS atomics respect the denormal mode from the mode register.
16674 //
16675 // Traditionally f32 global/buffer memory atomics would unconditionally
16676 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16677 // flush.
16678 //
16679 // On targets with flat atomic fadd, denormals would flush depending on
16680 // whether the target address resides in LDS or global memory. We consider
16681 // this flat-maybe-flush as will-flush.
16682 if (Ty->isFloatTy() &&
16686
16687 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16688 // safe. The message phrasing also should be better.
16689 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16690 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16691 // gfx940, gfx12
16692 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16693 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16694 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16695 // gfx90a, gfx940, gfx12
16696 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16697 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16698
16699 // gfx940, gfx12
16700 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16701 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16702 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16703 // gfx90a, gfx940, gfx12
16704 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16705 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16706
16707 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16708 // buffer. gfx12 does have the buffer version.
16709 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16710 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16711 }
16712
16713 // global and flat atomic fadd f64: gfx90a, gfx940.
16714 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16715 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16716
16717 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16718 if (Ty->isFloatTy()) {
16719 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16720 // gfx11+.
16721 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16722 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16723 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16724 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16725 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16726 } else {
16727 // gfx908
16728 if (RMW->use_empty() &&
16729 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16730 isV2F16(Ty))
16731 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16732 }
16733 }
16734
16735 // flat atomic fadd f32: gfx940, gfx11+.
16736 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16737 if (Subtarget->hasFlatAtomicFaddF32Inst())
16738 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16739
16740 // If the pointer is in the flat address space and the type is float, we
16741 // still try to expand when the target supports both global and LDS atomic
16742 // fadd. The reason: the expansion emits a runtime address-space check, and
16743 // then uses the global atomic fadd when the address turns out to be in the
16744 // global address space and the LDS atomic fadd when it is in the shared
16745 // address space.
16746 if (Subtarget->hasLDSFPAtomicAddF32()) {
16747 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16748 return AtomicExpansionKind::Expand;
16749 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16750 return AtomicExpansionKind::Expand;
16751 }
16752 }
16753 }
16754
16755 return AtomicExpansionKind::CmpXChg;
16756 }
16757 case AtomicRMWInst::FMin:
16758 case AtomicRMWInst::FMax: {
16759 Type *Ty = RMW->getType();
16760
16761 // LDS float and double fmin/fmax were always supported.
16762 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16763 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16764 : AtomicExpansionKind::CmpXChg;
16765 }
16766
16767 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16768 // For flat and global cases:
16769 // float, double in gfx7. Manual claims denormal support.
16770 // Removed in gfx8.
16771 // float, double restored in gfx10.
16772 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16773 //
16774 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16775 // no f32.
16776 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16777 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16778 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16779 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16780 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16781 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16782 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16783 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16784 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16785 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16786 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16787 }
16788 }
16789
16790 return AtomicExpansionKind::CmpXChg;
16791 }
16792 case AtomicRMWInst::Min:
16793 case AtomicRMWInst::Max:
16794 case AtomicRMWInst::UMin:
16795 case AtomicRMWInst::UMax: {
16796 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16797 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16798 // Always expand system scope min/max atomics.
16799 if (HasSystemScope)
16800 return AtomicExpansionKind::CmpXChg;
16801 }
16802
16803 return atomicSupportedIfLegalIntType(RMW);
16804 }
16805 case AtomicRMWInst::Nand:
16806 case AtomicRMWInst::FSub:
16807 default:
16808 return AtomicExpansionKind::CmpXChg;
16809 }
16810
16811 llvm_unreachable("covered atomicrmw op switch");
16812}
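// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): how a caller of a hook like the
// one above typically acts on the returned kind. The enumerator names mirror
// TargetLowering::AtomicExpansionKind, but the dispatcher itself is a
// simplified assumption, not LLVM's AtomicExpand pass.
#include <cstdio>

enum class SketchExpansionKind { None, CmpXChg, Expand, NotAtomic };

static void sketchDispatch(SketchExpansionKind Kind) {
  switch (Kind) {
  case SketchExpansionKind::None:
    std::puts("keep the atomicrmw; a legal hardware instruction exists");
    break;
  case SketchExpansionKind::CmpXChg:
    std::puts("rewrite into a load + compare-exchange retry loop");
    break;
  case SketchExpansionKind::Expand:
    std::puts("hand back to the target's custom expansion hook");
    break;
  case SketchExpansionKind::NotAtomic:
    std::puts("lower as ordinary (non-atomic) memory operations");
    break;
  }
}
// ---------------------------------------------------------------------------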
16813
16814TargetLowering::AtomicExpansionKind
16815SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16816 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16817 ? AtomicExpansionKind::NotAtomic
16818 : AtomicExpansionKind::None;
16819}
16820
16821TargetLowering::AtomicExpansionKind
16822SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16823 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16824 ? AtomicExpansionKind::NotAtomic
16825 : AtomicExpansionKind::None;
16826}
16827
16828TargetLowering::AtomicExpansionKind
16829SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16830 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16831 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16832 return AtomicExpansionKind::NotAtomic;
16833
16834 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16835 return AtomicExpansionKind::None;
16836
16837 const DataLayout &DL = CmpX->getDataLayout();
16838
16839 Type *ValTy = CmpX->getNewValOperand()->getType();
16840
16841 // If a 64-bit flat atomic may alias private, we need to avoid using the
16842 // atomic in the private case.
16843 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16844 : AtomicExpansionKind::None;
16845}
16846
16847const TargetRegisterClass *
16848SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16849 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16850 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16851 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16852 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16853 : &AMDGPU::SReg_32RegClass;
16854 if (!TRI->isSGPRClass(RC) && !isDivergent)
16855 return TRI->getEquivalentSGPRClass(RC);
16856 if (TRI->isSGPRClass(RC) && isDivergent)
16857 return TRI->getEquivalentVGPRClass(RC);
16858
16859 return RC;
16860}
16861
16862// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16863// uniform values (as produced by the mask results of control flow intrinsics)
16864// used outside of divergent blocks. The phi users need to also be treated as
16865// always uniform.
16866//
16867// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16868static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16869 unsigned WaveSize) {
16870 // FIXME: We assume we never cast the mask results of a control flow
16871 // intrinsic.
16872 // Early exit if the type won't be consistent as a compile time hack.
16873 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16874 if (!IT || IT->getBitWidth() != WaveSize)
16875 return false;
16876
16877 if (!isa<Instruction>(V))
16878 return false;
16879 if (!Visited.insert(V).second)
16880 return false;
16881 bool Result = false;
16882 for (const auto *U : V->users()) {
16883 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16884 if (V == U->getOperand(1)) {
16885 switch (Intrinsic->getIntrinsicID()) {
16886 default:
16887 Result = false;
16888 break;
16889 case Intrinsic::amdgcn_if_break:
16890 case Intrinsic::amdgcn_if:
16891 case Intrinsic::amdgcn_else:
16892 Result = true;
16893 break;
16894 }
16895 }
16896 if (V == U->getOperand(0)) {
16897 switch (Intrinsic->getIntrinsicID()) {
16898 default:
16899 Result = false;
16900 break;
16901 case Intrinsic::amdgcn_end_cf:
16902 case Intrinsic::amdgcn_loop:
16903 Result = true;
16904 break;
16905 }
16906 }
16907 } else {
16908 Result = hasCFUser(U, Visited, WaveSize);
16909 }
16910 if (Result)
16911 break;
16912 }
16913 return Result;
16914}
16915
16916bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16917 const Value *V) const {
16918 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16919 if (CI->isInlineAsm()) {
16920 // FIXME: This cannot give a correct answer. This should only trigger in
16921 // the case where inline asm returns mixed SGPR and VGPR results, used
16922 // outside the defining block. We don't have a specific result to
16923 // consider, so this assumes if any value is SGPR, the overall register
16924 // also needs to be SGPR.
16925 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16926 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16927 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16928 for (auto &TC : TargetConstraints) {
16929 if (TC.Type == InlineAsm::isOutput) {
16930 ComputeConstraintToUse(TC, SDValue());
16931 const TargetRegisterClass *RC =
16932 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16933 TC.ConstraintVT)
16934 .second;
16935 if (RC && SIRI->isSGPRClass(RC))
16936 return true;
16937 }
16938 }
16939 }
16940 }
16941 SmallPtrSet<const Value *, 16> Visited;
16942 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16943}
16944
16945bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16946 for (SDUse &Use : N->uses()) {
16947 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16948 if (getBasePtrIndex(M) == Use.getOperandNo())
16949 return true;
16950 }
16951 }
16952 return false;
16953}
16954
16955bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16956 SDValue N1) const {
16957 if (!N0.hasOneUse())
16958 return false;
16959 // Take care of the opportunity to keep N0 uniform
16960 if (N0->isDivergent() || !N1->isDivergent())
16961 return true;
16962 // Check if we have a good chance to form the memory access pattern with the
16963 // base and offset
16964 return (DAG.isBaseWithConstantOffset(N0) &&
16965 hasMemSDNodeUser(*N0->user_begin()));
16966}
16967
16968bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16969 Register N0, Register N1) const {
16970 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16971}
16972
16973MachineMemOperand::Flags
16974SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16975 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16976 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16977 if (I.getMetadata("amdgpu.noclobber"))
16978 Flags |= MONoClobber;
16979 if (I.getMetadata("amdgpu.last.use"))
16980 Flags |= MOLastUse;
16981 return Flags;
16982}
16983
16984bool SITargetLowering::checkForPhysRegDependency(
16985 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16986 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16987 if (User->getOpcode() != ISD::CopyToReg)
16988 return false;
16989 if (!Def->isMachineOpcode())
16990 return false;
16991 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16992 if (!MDef)
16993 return false;
16994
16995 unsigned ResNo = User->getOperand(Op).getResNo();
16996 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16997 return false;
16998 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16999 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17000 PhysReg = AMDGPU::SCC;
17001 const TargetRegisterClass *RC =
17002 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17003 Cost = RC->getCopyCost();
17004 return true;
17005 }
17006 return false;
17007}
17008
17009/// Check if it is profitable to hoist instruction in then/else to if.
17010bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17011 if (!I->hasOneUse())
17012 return true;
17013
17014 Instruction *User = I->user_back();
17015 // TODO: Add more patterns that are not profitable to hoist and
17016 // handle modifiers such as fabs and fneg
17017 switch (I->getOpcode()) {
17018 case Instruction::FMul: {
17019 if (User->getOpcode() != Instruction::FSub &&
17020 User->getOpcode() != Instruction::FAdd)
17021 return true;
17022
17023 const TargetOptions &Options = getTargetMachine().Options;
17024
17025 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17026 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17027 !Options.UnsafeFPMath) ||
17028 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17029 }
17030 default:
17031 return true;
17032 }
17033 return true;
17034}
17035
17036void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17037 Instruction *AI) const {
17038 // Given: atomicrmw fadd ptr %addr, float %val ordering
17039 //
17040 // With this expansion we produce the following code:
17041 // [...]
17042 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17043 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17044 //
17045 // atomicrmw.shared:
17046 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17047 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17048 // float %val ordering
17049 // br label %atomicrmw.phi
17050 //
17051 // atomicrmw.check.private:
17052 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17053 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17054 //
17055 // atomicrmw.private:
17056 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17057 // %loaded.private = load float, ptr addrspace(5) %cast.private
17058 // %val.new = fadd float %loaded.private, %val
17059 // store float %val.new, ptr addrspace(5) %cast.private
17060 // br label %atomicrmw.phi
17061 //
17062 // atomicrmw.global:
17063 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17064 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17065 // float %val ordering
17066 // br label %atomicrmw.phi
17067 //
17068 // atomicrmw.phi:
17069 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17070 // [ %loaded.private, %atomicrmw.private ],
17071 // [ %loaded.global, %atomicrmw.global ]
17072 // br label %atomicrmw.end
17073 //
17074 // atomicrmw.end:
17075 // [...]
17076 //
17077 //
17078 // For 64-bit atomics which may reside in private memory, we perform a simpler
17079 // version that only inserts the private check, and uses the flat operation.
17080
17081 IRBuilder<> Builder(AI);
17082 LLVMContext &Ctx = Builder.getContext();
17083
17084 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17085 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17086 : AtomicCmpXchgInst::getPointerOperandIndex();
17087 Value *Addr = AI->getOperand(PtrOpIdx);
17088
17089 /// TODO: Only need to check private, then emit flat-known-not private (no
17090 /// need for shared block, or cast to global).
17091 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17092
17093 Align Alignment;
17094 if (RMW)
17095 Alignment = RMW->getAlign();
17096 else if (CX)
17097 Alignment = CX->getAlign();
17098 else
17099 llvm_unreachable("unhandled atomic operation");
17100
17101 // FullFlatEmulation is true if we need to issue the private, shared, and
17102 // global cases.
17103 //
17104 // If this is false, we are only dealing with the flat-targeting-private case,
17105 // where we only insert a check for private and still use the flat instruction
17106 // for global and shared.
17107
17108 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17109 Subtarget->hasAtomicFaddInsts() &&
17110 RMW->getType()->isFloatTy();
17111
17112 // If the return value isn't used, do not introduce a false use in the phi.
17113 bool ReturnValueIsUsed = !AI->use_empty();
17114
17115 BasicBlock *BB = Builder.GetInsertBlock();
17116 Function *F = BB->getParent();
17117 BasicBlock *ExitBB =
17118 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17119 BasicBlock *SharedBB = nullptr;
17120
17121 BasicBlock *CheckPrivateBB = BB;
17122 if (FullFlatEmulation) {
17123 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17124 CheckPrivateBB =
17125 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17126 }
17127
17128 BasicBlock *PrivateBB =
17129 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17130 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17131 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17132
17133 std::prev(BB->end())->eraseFromParent();
17134 Builder.SetInsertPoint(BB);
17135
17136 Value *LoadedShared = nullptr;
17137 if (FullFlatEmulation) {
17138 CallInst *IsShared = Builder.CreateIntrinsic(
17139 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17140 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17141 Builder.SetInsertPoint(SharedBB);
17142 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17143 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17144
17145 Instruction *Clone = AI->clone();
17146 Clone->insertInto(SharedBB, SharedBB->end());
17147 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17148 LoadedShared = Clone;
17149
17150 Builder.CreateBr(PhiBB);
17151 Builder.SetInsertPoint(CheckPrivateBB);
17152 }
17153
17154 CallInst *IsPrivate = Builder.CreateIntrinsic(
17155 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17156 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17157
17158 Builder.SetInsertPoint(PrivateBB);
17159
17160 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17161 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17162
17163 Value *LoadedPrivate;
17164 if (RMW) {
17165 LoadedPrivate = Builder.CreateAlignedLoad(
17166 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17167
17168 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17169 LoadedPrivate, RMW->getValOperand());
17170
17171 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17172 } else {
17173 auto [ResultLoad, Equal] =
17174 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17175 CX->getNewValOperand(), CX->getAlign());
17176
17177 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17178 ResultLoad, 0);
17179 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17180 }
17181
17182 Builder.CreateBr(PhiBB);
17183
17184 Builder.SetInsertPoint(GlobalBB);
17185
17186 // Continue using a flat instruction if we only emitted the check for private.
17187 Instruction *LoadedGlobal = AI;
17188 if (FullFlatEmulation) {
17189 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17190 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17191 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17192 }
17193
17194 AI->removeFromParent();
17195 AI->insertInto(GlobalBB, GlobalBB->end());
17196
17197 // The new atomicrmw may go through another round of legalization later.
17198 if (!FullFlatEmulation) {
17199 // We inserted the runtime check already, make sure we do not try to
17200 // re-expand this.
17201 // TODO: Should union with any existing metadata.
17202 MDBuilder MDB(F->getContext());
17203 MDNode *RangeNotPrivate =
17204 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17205 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17206 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17207 RangeNotPrivate);
17208 }
17209
17210 Builder.CreateBr(PhiBB);
17211
17212 Builder.SetInsertPoint(PhiBB);
17213
17214 if (ReturnValueIsUsed) {
17215 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17216 AI->replaceAllUsesWith(Loaded);
17217 if (FullFlatEmulation)
17218 Loaded->addIncoming(LoadedShared, SharedBB);
17219 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17220 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17221 Loaded->takeName(AI);
17222 }
17223
17224 Builder.CreateBr(ExitBB);
17225}
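// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the control flow that the
// expansion above emits as IR, restated as ordinary C++. The three-way split
// (shared / private / global) and the "private is handled non-atomically"
// detail follow the comment block above; everything else (names, types, the
// use of std::atomic, which needs C++20 for fetch_add on float) is an
// assumption of this sketch.
#include <atomic>

enum class SketchAddrKind { Shared, Private, Global };

static float sketchExpandedFAdd(SketchAddrKind Kind,
                                std::atomic<float> *AtomicPtr,
                                float *PrivatePtr, float Val) {
  if (Kind == SketchAddrKind::Private) {
    // Private (scratch) memory is per-lane, so a plain load/add/store suffices.
    float Old = *PrivatePtr;
    *PrivatePtr = Old + Val;
    return Old;
  }
  // Shared and global both use a genuinely atomic operation; in the real
  // expansion they differ only in the address-space cast applied first.
  return AtomicPtr->fetch_add(Val);
}
// ---------------------------------------------------------------------------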
17226
17227void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17228 AtomicRMWInst::BinOp Op = AI->getOperation();
17229
17230 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17231 Op == AtomicRMWInst::Xor) {
17232 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17233 ConstVal && ConstVal->isNullValue()) {
17234 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17235 AI->setOperation(AtomicRMWInst::Add);
17236
17237 // We may still need the private-alias-flat handling below.
17238
17239 // TODO: Skip this for cases where we cannot access remote memory.
17240 }
17241 }
17242
17243 // The non-flat expansions should only perform the de-canonicalization of
17244 // identity values.
17245 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17246 return;
17247
17248 emitExpandAtomicAddrSpacePredicate(AI);
17249}
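// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the identity rewrite performed
// above ("atomicrmw or %p, 0" is treated as "atomicrmw add %p, 0", undoing
// InstCombine's canonicalization), restated over a small stand-in enum. The
// enum and helper are assumptions of this sketch, not LLVM API.
enum class SketchRMWOp { Add, Sub, Or, Xor };

static SketchRMWOp sketchCanonicalizeZeroIdentity(SketchRMWOp Op, long Operand) {
  // With a zero operand, sub/or/xor all leave memory unchanged and return the
  // old value, exactly like add 0 -- and add is the form remote (PCIe) memory
  // supports.
  if (Operand == 0 &&
      (Op == SketchRMWOp::Sub || Op == SketchRMWOp::Or || Op == SketchRMWOp::Xor))
    return SketchRMWOp::Add;
  return Op;
}
// ---------------------------------------------------------------------------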
17250
17251void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17252 emitExpandAtomicAddrSpacePredicate(CI);
17253}
17254
17255LoadInst *
17256SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17257 IRBuilder<> Builder(AI);
17258 auto Order = AI->getOrdering();
17259
17260 // The optimization removes the store side of the atomicrmw. Caches would
17261 // therefore have to be flushed if the atomic ordering carries release
17262 // semantics; the flush is not a fence as such, but a release fence happens
17263 // to perform it. So avoid replacing an atomicrmw that has release semantics.
17264 if (isReleaseOrStronger(Order))
17265 return nullptr;
17266
17267 LoadInst *LI = Builder.CreateAlignedLoad(
17268 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17269 LI->setAtomic(Order, AI->getSyncScopeID());
17270 LI->copyMetadata(*AI);
17271 LI->takeName(AI);
17272 AI->replaceAllUsesWith(LI);
17273 AI->eraseFromParent();
17274 return LI;
17275}
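// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the ordering gate used above
// when turning an idempotent atomicrmw (e.g. "or %p, 0") into an atomic load.
// The enumerator names mirror llvm::AtomicOrdering; the predicate is a
// restatement of the release-or-stronger check for this sketch only.
enum class SketchOrdering {
  Monotonic, Acquire, Release, AcquireRelease, SequentiallyConsistent
};

static bool sketchCanDropStoreSide(SketchOrdering Order) {
  // Dropping the store side also drops the write-release behaviour, so the
  // rewrite is only done when the ordering carries no release semantics.
  switch (Order) {
  case SketchOrdering::Release:
  case SketchOrdering::AcquireRelease:
  case SketchOrdering::SequentiallyConsistent:
    return false; // keep the atomicrmw
  case SketchOrdering::Monotonic:
  case SketchOrdering::Acquire:
    return true; // safe to lower to an atomic load with the same ordering
  }
  return false;
}
// ---------------------------------------------------------------------------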
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1122
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
bool isNegative() const
Definition: APFloat.h:1445
APInt bitcastToAPInt() const
Definition: APFloat.h:1351
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
bool isInfinity() const
Definition: APFloat.h:1442
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1451
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
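The block of GCNSubtarget predicates above is how the lowering code discovers which instructions and features the current GPU actually has. A minimal, hedged sketch of the usual pattern (the particular combination of checks is illustrative, not taken from this file):

  // 'Subtarget' stands for the GCNSubtarget pointer held by the lowering class.
  if (Subtarget->hasFlatGlobalInsts() && !Subtarget->hasUnpackedD16VMem()) {
    // Safe to assume global_* addressing and packed D16 memory accesses here.
  }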
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2561
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1163
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1157
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2156
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
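These IRBuilder entries are the calls the IR-level atomic expansion hooks lean on. As a rough, hedged sketch of the API shape only (the function and variable names below are placeholders, not code from SIISelLowering.cpp):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Cast a pointer into the flat address space (0) and load an i32 through it,
  // 4-byte aligned, at the builder's current insert point.
  static Value *loadThroughFlat(IRBuilder<> &B, Value *Ptr) {
    LLVMContext &Ctx = B.getContext();
    Value *Flat = B.CreateAddrSpaceCast(Ptr, PointerType::get(Ctx, 0), "flat");
    return B.CreateAlignedLoad(Type::getInt32Ty(Ctx), Flat, MaybeAlign(4), "val");
  }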
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
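A hedged fragment showing how the Instruction manipulation calls above combine to move an instruction; Orig and DestBB are hypothetical placeholders:

  Instruction *Copy = Orig->clone();        // unlinked copy of Orig
  Copy->insertInto(DestBB, DestBB->end());  // link it into another block
  Orig->eraseFromParent();                  // unlink and delete the original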
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
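A small, hedged illustration of the LLT helpers above (a fragment; assumes the usual GlobalISel headers):

  LLT S32 = LLT::scalar(32);             // 32-bit "bag of bits"
  LLT P0  = LLT::pointer(0, 64);         // 64-bit pointer in address space 0
  assert(S32.isScalar() && S32.getSizeInBits() == 32);
  assert(P0.getSizeInBits() == 64);
  LLT S16 = S32.changeElementSize(16);   // same shape, 16-bit element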
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
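A hedged two-line example of the sync-scope helpers above; Ctx is a placeholder LLVMContext and "agent" is shown only as a typical AMDGPU scope name:

  SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");
  std::optional<StringRef> Name = Ctx.getSyncScopeName(SSID); // yields "agent"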
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
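A hedged sketch of the MVT queries listed above:

  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);   // v4i32
  assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
  assert(V4I32.getScalarType() == MVT::i32);
  assert(V4I32.getSizeInBits() == 128 && V4I32.getStoreSize() == 16);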
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
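The MachineBasicBlock and MachineFunction calls above are the building blocks of the custom inserters documented further down. A hedged sketch of the standard split-and-splice pattern (BB and MI are placeholders for the current block and the instruction being expanded):

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), TailBB);
  // Move everything after MI into the new block, then fix CFG edges and PHIs.
  MachineBasicBlock::iterator SplitPt = std::next(MachineBasicBlock::iterator(&MI));
  TailBB->splice(TailBB->begin(), BB, SplitPt, BB->end());
  TailBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(TailBB);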
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
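A hedged sketch of the builder chaining above, as it typically appears inside a custom inserter; BB, MI, TII, DstReg and TargetMBB are placeholders, and the opcodes are illustrative:

  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);                          // DstReg = s_mov_b32 0
  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(TargetMBB);                  // conditional branch target operand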
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
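A hedged example of allocating a memory operand with the flags above; MF is a placeholder MachineFunction:

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),                // unknown memory location
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));          // 4-byte aligned 32-bit load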
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
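A hedged fragment showing how the SDValue/SDNode accessors above are typically combined when pattern-matching during a combine; Op is a placeholder value:

  if (Op.getOpcode() == ISD::ADD && Op.hasOneUse() &&
      isa<ConstantSDNode>(Op.getOperand(1))) {
    uint64_t Imm = Op.getConstantOperandVal(1); // value of the constant RHS
    SDValue LHS = Op.getOperand(0);
    if (!LHS.getNode()->isDivergent()) {
      // Uniform LHS plus immediate: a candidate for a scalar instruction.
    }
    (void)Imm;
  }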
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with an fmul/fadd to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:982
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:712
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:873
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
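Most of the DAG-construction calls above appear in the custom lowering routines. As a hedged, generic shape (not a routine from this file), lowering some operation into (x << 2) | 1 on its own type would look roughly like:

  SDValue lowerToShiftOr(SDValue Op, SelectionDAG &DAG) {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue Amt = DAG.getShiftAmountConstant(2, VT, DL);
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op, Amt);
    return DAG.getNode(ISD::OR, DL, VT, Shl, DAG.getConstant(1, DL, VT));
  }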
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
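A hedged one-liner for the StringSwitch pattern above, as used when mapping register or constraint names; Name is a placeholder StringRef and the strings are made up:

  unsigned Kind = StringSwitch<unsigned>(Name)
                      .Case("sgpr", 0)
                      .Case("vgpr", 1)
                      .Default(~0u);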
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
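A hedged fragment showing how a target constructor typically drives the configuration hooks above (a generic illustration, not the SI constructor; the operation/type pairs and the Subtarget accessor are assumptions):

  setOperationAction(ISD::FSIN, MVT::f32, Custom);  // handled in LowerOperation
  setOperationAction(ISD::SDIV, MVT::i64, Expand);  // let legalization open-code it
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTargetDAGCombine({ISD::ADD, ISD::FADD});
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(Subtarget->getRegisterInfo());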
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
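A hedged sketch of the Value/Use API above: replace one value with another and keep its name (Old and New are assumed to be Values of the same type created elsewhere):
static void replaceAndRename(Value *Old, Value *New) {
  assert(Old->getType() == New->getType() && "replacement must match type");
  // Each Use records which User consumes Old and at which operand index.
  for (Use &U : Old->uses())
    dbgs() << "use #" << U.getOperandNo() << " in " << *U.getUser() << "\n";
  New->takeName(Old);              // transfer the name from Old to New
  Old->replaceAllUsesWith(New);    // rewrite every use of Old to New
  assert(Old->use_empty() && "RAUW should have removed all uses");
}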
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
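Two tiny helpers, illustrative only, that branch on the AMDGPUAS enumerators listed above:
// True for the on-chip shared segments: LDS (local) and GDS (region).
static bool isLDSOrGDS(unsigned AS) {
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}
// True for the synthetic buffer pointer address spaces used by the backend.
static bool isBufferPointer(unsigned AS) {
  return AS == AMDGPUAS::BUFFER_FAT_POINTER ||
         AS == AMDGPUAS::BUFFER_STRIDED_POINTER ||
         AS == AMDGPUAS::BUFFER_RESOURCE;
}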
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
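A hedged example combining two of the AMDGPU helpers above; the wrapper function and the way the results are combined are illustrative only:
static bool canUseInlineConstantInEntry(int32_t Imm, const GCNSubtarget &ST,
                                        CallingConv::ID CC) {
  // HasInv2Pi: newer subtargets also accept 1/(2*pi) as an inline constant.
  bool Inlinable = AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
  // Entry functions (kernels and shaders) are not callable from other code.
  return Inlinable && AMDGPU::isEntryFunctionCC(CC);
}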
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
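An illustrative switch over the calling conventions above; the "compute-like" grouping is only an example, not a definition used by the backend:
static bool isComputeLikeCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:          // HSA code object kernels
  case CallingConv::AMDGPU_CS:              // Mesa/AMDPAL compute shaders
  case CallingConv::AMDGPU_CS_Chain:        // chain shader variants
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return true;
  case CallingConv::AMDGPU_PS:              // pixel shaders
  case CallingConv::AMDGPU_Gfx:             // callable graphics functions
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    return false;
  }
}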
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to the "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
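A small, purely illustrative fragment that builds nodes with a few of the ISD opcodes above; DAG, DL, and the i32 operands are assumed to come from the surrounding lowering code:
static SDValue buildPairAndAdd(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue Lo, SDValue Hi) {
  // BUILD_PAIR glues two i32 halves into a single i64 value.
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
  // EXTRACT_ELEMENT with index 1 peels the high half back off.
  SDValue HiAgain = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Pair,
                                DAG.getConstant(1, DL, MVT::i32));
  // Ordinary binary arithmetic (ISD::ADD) on the recovered half.
  return DAG.getNode(ISD::ADD, DL, MVT::i32, HiAgain, Lo);
}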
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1639
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1586
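A one-line illustration of ISD::getSetCCSwappedOperands: if a combine commutes the operands of a comparison, the condition code must be swapped to match (the wrapper is hypothetical):
static ISD::CondCode swapIfOperandsCommuted(ISD::CondCode CC, bool Commuted) {
  // (X op Y) with operands swapped becomes (Y op' X), where op' is the
  // swapped predicate, e.g. SETLT <-> SETGT.
  return Commuted ? ISD::getSetCCSwappedOperands(CC) : CC;
}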
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1609
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for an N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:555
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
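A short sketch exercising the bit/math helpers listed above; the values are chosen only to show the expected results, and <cassert> plus the llvm namespace are assumed:
static void bitMathExamples() {
  const uint64_t V = 0x0000000100000000ULL;
  assert(Hi_32(V) == 1u && Lo_32(V) == 0u);        // split a 64-bit value
  assert(isPowerOf2_32(64) && Log2_32(64) == 6);   // power-of-two queries
  assert(PowerOf2Ceil(33) == 64);                  // round up to a power of two
  assert(countr_zero(0x8u) == 3);                  // trailing zeros
  assert(divideCeil(10, 4) == 3 && alignDown(10, 4) == 8);
  assert(maxIntN(8) == 127 && minIntN(8) == -128); // N-bit signed bounds
}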
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for an N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
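A tiny sketch of the alignment helpers referenced above (Align, alignTo, commonAlignment); the numbers are illustrative:
static void alignmentExamples() {
  Align A(16);                               // must be a non-zero power of two
  assert(A.value() == 16);
  assert(alignTo(/*Size=*/20, A) == 32);     // round 20 up to a multiple of 16
  // The alignment guaranteed at a 16-byte-aligned base plus an offset of 8 is 8.
  assert(commonAlignment(A, /*Offset=*/8) == Align(8));
}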
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
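A hedged sketch of the EVT queries above, classifying a type the way a lowering hook might before splitting it; the thresholds and the helper itself are invented:
static bool needsSplitting(EVT VT, LLVMContext &Ctx) {
  if (!VT.isVector())
    return VT.getSizeInBits().getFixedValue() > 64;  // wide scalars get split
  // Rebuild an integer vector of the same shape, e.g. for bitcast-based lowering.
  EVT EltVT = VT.getVectorElementType();
  EVT IntVT = EVT::getVectorVT(
      Ctx, EVT::getIntegerVT(Ctx, EltVT.getScalarSizeInBits()),
      VT.getVectorNumElements());
  return !IntVT.isPow2VectorType() ||
         VT.getStoreSize().getFixedValue() > 16;     // more than four dwords
}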
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
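A brief sketch of the KnownBits operations above: combine knowledge about two small addends and query the sum (values are illustrative, <cassert> assumed):
static void knownBitsExample() {
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(16);                 // LHS is known < 2^16
  RHS.Zero.setHighBits(16);                 // RHS is known < 2^16
  KnownBits Sum = KnownBits::add(LHS, RHS); // the sum fits in 17 bits
  assert(Sum.countMinLeadingZeros() >= 15);
  Sum.resetAll();                           // forget everything
  assert(Sum.isUnknown());
}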
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals