1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations on these types are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
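// For example, a zero-extended i1 compare becomes (select cond, 1, 0) and a
// sign-extended one (select cond, -1, 0); on this target both are typically a
// single v_cndmask_b32, so neither encoding is cheaper.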
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value,
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // This is s_memtime on SI and s_memrealtime on VI.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
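// For example, an extract of bits [40, 48) of an i64 only touches the high
// dword and could be done as a 32-bit BFE of bits [8, 16) of that half, while
// an extract of bits [28, 36) spans the dword boundary and cannot.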
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
788 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
789 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 } else {
860 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
861 if (Subtarget->hasMinimum3Maximum3F32())
863
864 if (Subtarget->hasMinimum3Maximum3PKF16())
866 }
867
869 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
870 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
871 MVT::i8},
872 Custom);
873
875 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
876 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
877 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
878 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
882 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
883 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
884 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
885 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
886 Custom);
887
893
894 // TODO: Could move this to custom lowering, could benefit from combines on
895 // extract of relevant bits.
897
899
900 if (Subtarget->hasBF16ConversionInsts()) {
904 }
905
906 if (Subtarget->hasCvtPkF16F32Inst()) {
908 }
909
912 ISD::SUB,
914 ISD::MUL,
915 ISD::FADD,
916 ISD::FSUB,
917 ISD::FDIV,
918 ISD::FMUL,
925 ISD::FMA,
926 ISD::SMIN,
927 ISD::SMAX,
928 ISD::UMIN,
929 ISD::UMAX,
932 ISD::SMIN,
933 ISD::SMAX,
934 ISD::UMIN,
935 ISD::UMAX,
936 ISD::AND,
937 ISD::OR,
938 ISD::XOR,
939 ISD::SHL,
940 ISD::SRL,
941 ISD::SRA,
942 ISD::FSHR,
952
953 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955
956 // All memory operations. Some folding on the pointer operand is done to help
957 // matching the constant offsets in the addressing modes.
982
983 // FIXME: In other contexts we pretend this is a per-function property.
985
987}
988
989const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
990
992 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
993 return RCRegs;
994}
995
996//===----------------------------------------------------------------------===//
997// TargetLowering queries
998//===----------------------------------------------------------------------===//
999
1000// v_mad_mix* support a conversion from f16 to f32.
1001//
1002// There is only one special case, when denormals are enabled, that we don't
1003// currently handle where this would still be OK to use.
1004bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1005 EVT DestVT, EVT SrcVT) const {
1006 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1007 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1008 DestVT.getScalarType() == MVT::f32 &&
1009 SrcVT.getScalarType() == MVT::f16 &&
1010 // TODO: This probably only requires no input flushing?
1012}
1013
1015 LLT DestTy, LLT SrcTy) const {
1016 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1017 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1018 DestTy.getScalarSizeInBits() == 32 &&
1019 SrcTy.getScalarSizeInBits() == 16 &&
1020 // TODO: This probably only requires no input flushing?
1021 denormalModeIsFlushAllF32(*MI.getMF());
1022}
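// As a rough illustration of what this enables:
//   %e = fpext half %x to float
//   %r = call float @llvm.fma.f32(float %e, float %y, float %z)
// can keep the f16->f32 extension folded into the FMA and select
// v_fma_mix_f32 (or v_mad_mix_f32 on mad-mix subtargets), but only when f32
// denormals are flushed, as checked above.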
1023
1025 // SI has some legal vector types, but no legal vector operations. Say no
1026 // shuffles are legal in order to prefer scalarizing some vector operations.
1027 return false;
1028}
1029
1032 EVT VT) const {
1035
1036 if (VT.isVector()) {
1037 EVT ScalarVT = VT.getScalarType();
1038 unsigned Size = ScalarVT.getSizeInBits();
1039 if (Size == 16) {
1040 if (Subtarget->has16BitInsts()) {
1041 if (VT.isInteger())
1042 return MVT::v2i16;
1043 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1044 }
1045 return VT.isInteger() ? MVT::i32 : MVT::f32;
1046 }
1047
1048 if (Size < 16)
1049 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1050 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1051 }
1052
1053 if (VT.getSizeInBits() > 32)
1054 return MVT::i32;
1055
1057}
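// For non-kernel calling conventions the rules above give, for example:
// v4f16 -> v2f16 registers, v4bf16 -> i32, v3f32 -> f32, and v2i64 -> i32.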
1058
1061 EVT VT) const {
1064
1065 if (VT.isVector()) {
1066 unsigned NumElts = VT.getVectorNumElements();
1067 EVT ScalarVT = VT.getScalarType();
1068 unsigned Size = ScalarVT.getSizeInBits();
1069
1070 // FIXME: Should probably promote 8-bit vectors to i16.
1071 if (Size == 16 && Subtarget->has16BitInsts())
1072 return (NumElts + 1) / 2;
1073
1074 if (Size <= 32)
1075 return NumElts;
1076
1077 if (Size > 32)
1078 return NumElts * ((Size + 31) / 32);
1079 } else if (VT.getSizeInBits() > 32)
1080 return (VT.getSizeInBits() + 31) / 32;
1081
1083}
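// For example, with 16-bit instructions available a v5f16 argument needs
// (5 + 1) / 2 = 3 registers, v3i64 needs 3 * ((64 + 31) / 32) = 6, and a
// scalar i128 needs (128 + 31) / 32 = 4.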
1084
1086 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1087 unsigned &NumIntermediates, MVT &RegisterVT) const {
1088 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1089 unsigned NumElts = VT.getVectorNumElements();
1090 EVT ScalarVT = VT.getScalarType();
1091 unsigned Size = ScalarVT.getSizeInBits();
1092 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1093 // support, but unless we can properly handle 3-vectors, it will still be
1094 // inconsistent.
1095 if (Size == 16 && Subtarget->has16BitInsts()) {
1096 if (ScalarVT == MVT::bf16) {
1097 RegisterVT = MVT::i32;
1098 IntermediateVT = MVT::v2bf16;
1099 } else {
1100 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1101 IntermediateVT = RegisterVT;
1102 }
1103 NumIntermediates = (NumElts + 1) / 2;
1104 return NumIntermediates;
1105 }
1106
1107 if (Size == 32) {
1108 RegisterVT = ScalarVT.getSimpleVT();
1109 IntermediateVT = RegisterVT;
1110 NumIntermediates = NumElts;
1111 return NumIntermediates;
1112 }
1113
1114 if (Size < 16 && Subtarget->has16BitInsts()) {
1115 // FIXME: Should probably form v2i16 pieces
1116 RegisterVT = MVT::i16;
1117 IntermediateVT = ScalarVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size != 16 && Size <= 32) {
1123 RegisterVT = MVT::i32;
1124 IntermediateVT = ScalarVT;
1125 NumIntermediates = NumElts;
1126 return NumIntermediates;
1127 }
1128
1129 if (Size > 32) {
1130 RegisterVT = MVT::i32;
1131 IntermediateVT = RegisterVT;
1132 NumIntermediates = NumElts * ((Size + 31) / 32);
1133 return NumIntermediates;
1134 }
1135 }
1136
1138 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1139}
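// For example, under a non-kernel calling convention with 16-bit instructions
// a v5f16 argument is broken into 3 v2f16 intermediates held in v2f16
// registers, while a v3i64 argument becomes 6 i32 pieces.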
1140
1142 const DataLayout &DL, Type *Ty,
1143 unsigned MaxNumLanes) {
1144 assert(MaxNumLanes != 0);
1145
1146 LLVMContext &Ctx = Ty->getContext();
1147 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1148 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1149 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1150 NumElts);
1151 }
1152
1153 return TLI.getValueType(DL, Ty);
1154}
1155
1156// Peek through TFE struct returns to only use the data size.
1158 const DataLayout &DL, Type *Ty,
1159 unsigned MaxNumLanes) {
1160 auto *ST = dyn_cast<StructType>(Ty);
1161 if (!ST)
1162 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1163
1164 // TFE intrinsics return an aggregate type.
1165 assert(ST->getNumContainedTypes() == 2 &&
1166 ST->getContainedType(1)->isIntegerTy(32));
1167 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1168}
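// For example, a TFE image load whose IR return type is { <4 x float>, i32 }
// only contributes the <4 x float> data part to the memory VT; the trailing
// i32 status value is ignored here, and the caller may shrink the vector
// further based on the dmask.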
1169
1170/// Map address space 7 to MVT::v5i32 because that's its in-memory
1171/// representation. This return value is vector-typed because there is no
1172/// MVT::i160 and it is not clear if one can be added. While this could
1173/// cause issues during codegen, these address space 7 pointers will be
1174/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1175/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1176/// modeling, to work.
1178 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1179 return MVT::v5i32;
1181 DL.getPointerSizeInBits(AS) == 192)
1182 return MVT::v6i32;
1184}
1185/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1186/// v8i32 when padding is added.
1187/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1188/// also v8i32 with padding.
1190 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1191 DL.getPointerSizeInBits(AS) == 160) ||
1193 DL.getPointerSizeInBits(AS) == 192))
1194 return MVT::v8i32;
1196}
1197
1199 const CallInst &CI,
1200 MachineFunction &MF,
1201 unsigned IntrID) const {
1203 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1205 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1207 Info.flags |= getTargetMMOFlags(CI);
1208
1209 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1211 AttributeList Attr =
1213 MemoryEffects ME = Attr.getMemoryEffects();
1214 if (ME.doesNotAccessMemory())
1215 return false;
1216
1217 // TODO: Should images get their own address space?
1218 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1219
1220 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1221 if (RsrcIntr->IsImage) {
1224 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1225 Info.align.reset();
1226 }
1227
1228 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1229 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1230 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1231 // We conservatively set the memory operand of a buffer intrinsic to the
1232 // base resource pointer, so that we can access alias information about
1233 // those pointers. Cases like "this points at the same value
1234 // but with a different offset" are handled in
1235 // areMemAccessesTriviallyDisjoint.
1236 Info.ptrVal = RsrcArg;
1237 }
1238
1239 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1240 if (!IsSPrefetch) {
1241 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1242 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1244 }
1245
1247 if (ME.onlyReadsMemory()) {
1248 if (RsrcIntr->IsImage) {
1249 unsigned MaxNumLanes = 4;
1250
1251 if (!BaseOpcode->Gather4) {
1252 // If this isn't a gather, we may have excess loaded elements in the
1253 // IR type. Check the dmask for the real number of elements loaded.
1254 unsigned DMask =
1255 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1256 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1257 }
1258
1259 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1260 CI.getType(), MaxNumLanes);
1261 } else {
1262 Info.memVT =
1264 std::numeric_limits<unsigned>::max());
1265 }
1266
1267 // FIXME: What does alignment mean for an image?
1270 } else if (ME.onlyWritesMemory()) {
1272
1273 Type *DataTy = CI.getArgOperand(0)->getType();
1274 if (RsrcIntr->IsImage) {
1275 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1276 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1277 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1278 DMaskLanes);
1279 } else
1280 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1281
1283 } else {
1284 // Atomic, NoReturn Sampler or prefetch
1287 Info.flags |=
1289
1290 if (!IsSPrefetch)
1292
1293 switch (IntrID) {
1294 default:
1295 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1296 // Fake memory access type for no return sampler intrinsics
1297 Info.memVT = MVT::i32;
1298 } else {
1299 // XXX - Should this be volatile without known ordering?
1301 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1302 }
1303 break;
1304 case Intrinsic::amdgcn_raw_buffer_load_lds:
1305 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1306 case Intrinsic::amdgcn_struct_buffer_load_lds:
1307 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1308 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1309 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1310 Info.ptrVal = CI.getArgOperand(1);
1311 return true;
1312 }
1313 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1314 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1315 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1316 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1317 Info.memVT =
1319 std::numeric_limits<unsigned>::max());
1320 Info.flags &= ~MachineMemOperand::MOStore;
1321 return true;
1322 }
1323 }
1324 }
1325 return true;
1326 }
1327
1328 switch (IntrID) {
1329 case Intrinsic::amdgcn_ds_ordered_add:
1330 case Intrinsic::amdgcn_ds_ordered_swap: {
1332 Info.memVT = MVT::getVT(CI.getType());
1333 Info.ptrVal = CI.getOperand(0);
1334 Info.align.reset();
1336
1337 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1338 if (!Vol->isZero())
1340
1341 return true;
1342 }
1343 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1344 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1346 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1347 Info.ptrVal = nullptr;
1348 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1350 return true;
1351 }
1352 case Intrinsic::amdgcn_ds_append:
1353 case Intrinsic::amdgcn_ds_consume: {
1355 Info.memVT = MVT::getVT(CI.getType());
1356 Info.ptrVal = CI.getOperand(0);
1357 Info.align.reset();
1359
1360 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1361 if (!Vol->isZero())
1363
1364 return true;
1365 }
1366 case Intrinsic::amdgcn_global_atomic_csub: {
1368 Info.memVT = MVT::getVT(CI.getType());
1369 Info.ptrVal = CI.getOperand(0);
1370 Info.align.reset();
1373 return true;
1374 }
1375 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1377 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1378
1379 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1380 Info.align.reset();
1381 Info.flags |=
1383 return true;
1384 }
1385 case Intrinsic::amdgcn_global_atomic_fmin_num:
1386 case Intrinsic::amdgcn_global_atomic_fmax_num:
1387 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1388 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1389 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1390 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1392 Info.memVT = MVT::getVT(CI.getType());
1393 Info.ptrVal = CI.getOperand(0);
1394 Info.align.reset();
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_global_load_tr_b64:
1401 case Intrinsic::amdgcn_global_load_tr_b128:
1402 case Intrinsic::amdgcn_ds_read_tr4_b64:
1403 case Intrinsic::amdgcn_ds_read_tr6_b96:
1404 case Intrinsic::amdgcn_ds_read_tr8_b64:
1405 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1407 Info.memVT = MVT::getVT(CI.getType());
1408 Info.ptrVal = CI.getOperand(0);
1409 Info.align.reset();
1411 return true;
1412 }
1413 case Intrinsic::amdgcn_ds_gws_init:
1414 case Intrinsic::amdgcn_ds_gws_barrier:
1415 case Intrinsic::amdgcn_ds_gws_sema_v:
1416 case Intrinsic::amdgcn_ds_gws_sema_br:
1417 case Intrinsic::amdgcn_ds_gws_sema_p:
1418 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1420
1421 const GCNTargetMachine &TM =
1422 static_cast<const GCNTargetMachine &>(getTargetMachine());
1423
1425 Info.ptrVal = MFI->getGWSPSV(TM);
1426
1427 // This is an abstract access, but we need to specify a type and size.
1428 Info.memVT = MVT::i32;
1429 Info.size = 4;
1430 Info.align = Align(4);
1431
1432 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1434 else
1436 return true;
1437 }
1438 case Intrinsic::amdgcn_global_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1444 return true;
1445 }
1446 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1448
1449 const GCNTargetMachine &TM =
1450 static_cast<const GCNTargetMachine &>(getTargetMachine());
1451
1453 Info.ptrVal = MFI->getGWSPSV(TM);
1454
1455 // This is an abstract access, but we need to specify a type and size.
1456 Info.memVT = MVT::i32;
1457 Info.size = 4;
1458 Info.align = Align(4);
1459
1461 return true;
1462 }
1463 case Intrinsic::amdgcn_s_prefetch_data: {
1465 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1466 Info.ptrVal = CI.getArgOperand(0);
1468 return true;
1469 }
1470 default:
1471 return false;
1472 }
1473}
1474
1476 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1477 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1478 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1479 // The DAG's ValueType loses the addrspaces.
1480 // Add them as 2 extra Constant operands "from" and "to".
1481 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1482 unsigned DstAS = I.getType()->getPointerAddressSpace();
1483 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1484 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1485 break;
1486 }
1487 default:
1488 break;
1489 }
1490}
1491
1494 Type *&AccessTy) const {
1495 Value *Ptr = nullptr;
1496 switch (II->getIntrinsicID()) {
1497 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1498 case Intrinsic::amdgcn_ds_append:
1499 case Intrinsic::amdgcn_ds_consume:
1500 case Intrinsic::amdgcn_ds_read_tr4_b64:
1501 case Intrinsic::amdgcn_ds_read_tr6_b96:
1502 case Intrinsic::amdgcn_ds_read_tr8_b64:
1503 case Intrinsic::amdgcn_ds_read_tr16_b64:
1504 case Intrinsic::amdgcn_ds_ordered_add:
1505 case Intrinsic::amdgcn_ds_ordered_swap:
1506 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1507 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1508 case Intrinsic::amdgcn_global_atomic_csub:
1509 case Intrinsic::amdgcn_global_atomic_fmax_num:
1510 case Intrinsic::amdgcn_global_atomic_fmin_num:
1511 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1512 case Intrinsic::amdgcn_global_load_tr_b64:
1513 case Intrinsic::amdgcn_global_load_tr_b128:
1514 Ptr = II->getArgOperand(0);
1515 break;
1516 case Intrinsic::amdgcn_global_load_lds:
1517 Ptr = II->getArgOperand(1);
1518 break;
1519 default:
1520 return false;
1521 }
1522 AccessTy = II->getType();
1523 Ops.push_back(Ptr);
1524 return true;
1525}
1526
1528 unsigned AddrSpace) const {
1529 if (!Subtarget->hasFlatInstOffsets()) {
1530 // Flat instructions do not have offsets, and only have the register
1531 // address.
1532 return AM.BaseOffs == 0 && AM.Scale == 0;
1533 }
1534
1535 decltype(SIInstrFlags::FLAT) FlatVariant =
1539
1540 return AM.Scale == 0 &&
1541 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1542 AM.BaseOffs, AddrSpace, FlatVariant));
1543}
1544
1546 if (Subtarget->hasFlatGlobalInsts())
1548
1549 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1550 // Assume that we will use FLAT for all global memory accesses
1551 // on VI.
1552 // FIXME: This assumption is currently wrong. On VI we still use
1553 // MUBUF instructions for the r + i addressing mode. As currently
1554 // implemented, the MUBUF instructions only work on buffer < 4GB.
1555 // It may be possible to support > 4GB buffers with MUBUF instructions,
1556 // by setting the stride value in the resource descriptor which would
1557 // increase the size limit to (stride * 4GB). However, this is risky,
1558 // because it has never been validated.
1560 }
1561
1562 return isLegalMUBUFAddressingMode(AM);
1563}
1564
1565bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1566 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1567 // additionally can do r + r + i with addr64. 32-bit has more addressing
1568 // mode options. Depending on the resource constant, it can also do
1569 // (i64 r0) + (i32 r1) * (i14 i).
1570 //
1571 // Private arrays end up using a scratch buffer most of the time, so also
1572 // assume those use MUBUF instructions. Scratch loads / stores are currently
1573 // implemented as mubuf instructions with offen bit set, so slightly
1574 // different than the normal addr64.
1575 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1576 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1577 return false;
1578
1579 // FIXME: Since we can split immediate into soffset and immediate offset,
1580 // would it make sense to allow any immediate?
1581
1582 switch (AM.Scale) {
1583 case 0: // r + i or just i, depending on HasBaseReg.
1584 return true;
1585 case 1:
1586 return true; // We have r + r or r + i.
1587 case 2:
1588 if (AM.HasBaseReg) {
1589 // Reject 2 * r + r.
1590 return false;
1591 }
1592
1593 // Allow 2 * r as r + r
1594 // Or 2 * r + i is allowed as r + r + i.
1595 return true;
1596 default: // Don't allow n * r
1597 return false;
1598 }
1599}
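// In addressing-mode terms this accepts e.g. "reg + 4095" (a 12-bit unsigned
// immediate, per the comment above), "reg + reg + imm", and "2 * reg + imm",
// while anything scaled by 3 or more is rejected.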
1600
1602 const AddrMode &AM, Type *Ty,
1603 unsigned AS,
1604 Instruction *I) const {
1605 // No global is ever allowed as a base.
1606 if (AM.BaseGV)
1607 return false;
1608
1609 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1610 return isLegalGlobalAddressingMode(AM);
1611
1612 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1616 // If the offset isn't a multiple of 4, it probably isn't going to be
1617 // correctly aligned.
1618 // FIXME: Can we get the real alignment here?
1619 if (AM.BaseOffs % 4 != 0)
1620 return isLegalMUBUFAddressingMode(AM);
1621
1622 if (!Subtarget->hasScalarSubwordLoads()) {
1623 // There are no SMRD extloads, so if we have to do a small type access we
1624 // will use a MUBUF load.
1625 // FIXME?: We also need to do this if unaligned, but we don't know the
1626 // alignment here.
1627 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1628 return isLegalGlobalAddressingMode(AM);
1629 }
1630
1632 // SMRD instructions have an 8-bit, dword offset on SI.
1633 if (!isUInt<8>(AM.BaseOffs / 4))
1634 return false;
1635 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1636 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1637 // in 8-bits, it can use a smaller encoding.
1638 if (!isUInt<32>(AM.BaseOffs / 4))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1641 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1642 if (!isUInt<20>(AM.BaseOffs))
1643 return false;
1644 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1645 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1646 // for S_BUFFER_* instructions).
1647 if (!isInt<21>(AM.BaseOffs))
1648 return false;
1649 } else {
1650 // On GFX12, all offsets are signed 24-bit in bytes.
1651 if (!isInt<24>(AM.BaseOffs))
1652 return false;
1653 }
1654
1655 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1657 AM.BaseOffs < 0) {
1658 // Scalar (non-buffer) loads can only use a negative offset if
1659 // soffset+offset is non-negative. Since the compiler can only prove that
1660 // in a few special cases, it is safer to claim that negative offsets are
1661 // not supported.
1662 return false;
1663 }
1664
1665 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1666 return true;
1667
1668 if (AM.Scale == 1 && AM.HasBaseReg)
1669 return true;
1670
1671 return false;
1672 }
1673
1674 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1675 return Subtarget->enableFlatScratch()
1677 : isLegalMUBUFAddressingMode(AM);
1678
1679 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1680 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1681 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1682 // field.
1683 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1684 // an 8-bit dword offset but we don't know the alignment here.
1685 if (!isUInt<16>(AM.BaseOffs))
1686 return false;
1687
1688 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1689 return true;
1690
1691 if (AM.Scale == 1 && AM.HasBaseReg)
1692 return true;
1693
1694 return false;
1695 }
1696
1698 // For an unknown address space, this usually means that this is for some
1699 // reason being used for pure arithmetic, and not based on some addressing
1700 // computation. We don't have instructions that compute pointers with any
1701 // addressing modes, so treat them as having no offset like flat
1702 // instructions.
1704 }
1705
1706 // Assume a user alias of global for unknown address spaces.
1707 return isLegalGlobalAddressingMode(AM);
1708}
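// As a rough illustration of the scalar (SMRD/SMEM) offset rules above: SI
// can only fold a dword-aligned offset of up to 1020 bytes, VI allows an
// unsigned 20-bit byte offset, and GFX9+ a signed 21-bit (GFX12: 24-bit) one,
// though negative offsets are still rejected here because soffset + offset
// cannot generally be proven non-negative.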
1709
1711 const MachineFunction &MF) const {
1713 return (MemVT.getSizeInBits() <= 4 * 32);
1714 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1715 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1716 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1717 }
1719 return (MemVT.getSizeInBits() <= 2 * 32);
1720 return true;
1721}
1722
1724 unsigned Size, unsigned AddrSpace, Align Alignment,
1725 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1726 if (IsFast)
1727 *IsFast = 0;
1728
1729 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1730 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1731 // Check if alignment requirements for ds_read/write instructions are
1732 // disabled.
1733 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1734 return false;
1735
1736 Align RequiredAlignment(
1737 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1738 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1739 Alignment < RequiredAlignment)
1740 return false;
1741
1742 // Either the alignment requirements are "enabled", or there is an
1743 // unaligned-LDS-access hardware bug even though the alignment requirements
1744 // are "disabled". In either case, we need to check for proper alignment
1745 // requirements.
1746 //
1747 switch (Size) {
1748 case 64:
1749 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1750 // address is negative, then the instruction is incorrectly treated as
1751 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1752 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1753 // load later in the SILoadStoreOptimizer.
1754 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1755 return false;
1756
1757 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1758 // can do a 4-byte aligned, 8-byte access in a single operation using
1759 // ds_read2/write2_b32 with adjacent offsets.
1760 RequiredAlignment = Align(4);
1761
1762 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1763 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1764 // ds_write2_b32 depending on the alignment. In either case with either
1765 // alignment there is no faster way of doing this.
1766
1767 // The numbers returned here and below are not additive; they form a 'speed
1768 // rank'. They are just meant to be compared to decide if a certain way
1769 // of lowering an operation is faster than another. For that purpose a
1770 // naturally aligned operation gets its bitsize to indicate that "it
1771 // operates with a speed comparable to an N-bit wide load". With full
1772 // alignment, ds128 is slower than ds96, for example. If underaligned, it
1773 // is comparable to the speed of a single dword access, which would then
1774 // mean 32 < 128 and it is faster to issue a wide load regardless.
1775 // 1 simply means "slow, don't do it", i.e. when comparing an aligned load
1776 // to a wider load that will no longer be aligned, the latter is slower.
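// Concretely, for this 64-bit case: a 4-byte or better aligned access
// reports 64, and anything less aligned reports 32 (single-dword speed),
// so an underaligned wide load is still preferred over splitting it.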
1777 if (IsFast)
1778 *IsFast = (Alignment >= RequiredAlignment) ? 64
1779 : (Alignment < Align(4)) ? 32
1780 : 1;
1781 return true;
1782 }
1783
1784 break;
1785 case 96:
1786 if (!Subtarget->hasDS96AndDS128())
1787 return false;
1788
1789 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1790 // gfx8 and older.
1791
1792 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1793 // Naturally aligned access is fastest. However, also report it as Fast
1794 // if memory is aligned to less than a DWORD. A narrow load or store will
1795 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1796 // be more of them, so overall we will pay less of a penalty issuing a
1797 // single instruction.
1798
1799 // See comment on the values above.
1800 if (IsFast)
1801 *IsFast = (Alignment >= RequiredAlignment) ? 96
1802 : (Alignment < Align(4)) ? 32
1803 : 1;
1804 return true;
1805 }
1806
1807 break;
1808 case 128:
1809 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1810 return false;
1811
1812 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1813 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1814 // single operation using ds_read2/write2_b64.
1815 RequiredAlignment = Align(8);
1816
1817 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1818 // Naturally aligned access is fastest. However, also report it as Fast
1819 // if memory is aligned to less than a DWORD. A narrow load or store will
1820 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1821 // will be more of them, so overall we will pay less of a penalty issuing
1822 // a single instruction.
1823
1824 // See comment on the values above.
1825 if (IsFast)
1826 *IsFast = (Alignment >= RequiredAlignment) ? 128
1827 : (Alignment < Align(4)) ? 32
1828 : 1;
1829 return true;
1830 }
1831
1832 break;
1833 default:
1834 if (Size > 32)
1835 return false;
1836
1837 break;
1838 }
1839
1840 // See comment on the values above.
1841 // Note that we have a single-dword or sub-dword access here, so if it is
1842 // underaligned it is the slowest possible access, hence the returned value is 0.
1843 if (IsFast)
1844 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1845
1846 return Alignment >= RequiredAlignment ||
1847 Subtarget->hasUnalignedDSAccessEnabled();
1848 }
1849
1850 // FIXME: We have to be conservative here and assume that flat operations
1851 // will access scratch. If we had access to the IR function, then we
1852 // could determine if any private memory was used in the function.
1853 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1854 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1855 bool AlignedBy4 = Alignment >= Align(4);
1856 if (IsFast)
1857 *IsFast = AlignedBy4;
1858
1859 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1860 }
1861
1862 // So long as they are correct, wide global memory operations perform better
1863 // than multiple smaller memory ops -- even when misaligned
1864 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1865 if (IsFast)
1866 *IsFast = Size;
1867
1868 return Alignment >= Align(4) ||
1870 }
1871
1872 // Smaller-than-dword values must be aligned.
1873 if (Size < 32)
1874 return false;
1875
1876 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1877 // byte-address are ignored, thus forcing Dword alignment.
1878 // This applies to private, global, and constant memory.
1879 if (IsFast)
1880 *IsFast = 1;
1881
1882 return Size >= 32 && Alignment >= Align(4);
1883}
1884
1886 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1887 unsigned *IsFast) const {
1889 Alignment, Flags, IsFast);
1890}
1891
1893 const MemOp &Op, const AttributeList &FuncAttributes) const {
1894 // FIXME: Should account for address space here.
1895
1896 // The default fallback uses the private pointer size as a guess for a type to
1897 // use. Make sure we switch these to 64-bit accesses.
1898
1899 if (Op.size() >= 16 &&
1900 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1901 return MVT::v4i32;
1902
1903 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1904 return MVT::v2i32;
1905
1906 // Use the default.
1907 return MVT::Other;
1908}
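// For example, a 32-byte memcpy whose destination is known to be 4-byte
// aligned is emitted as v4i32 (dwordx4) chunks, and an 8-byte one as v2i32,
// instead of falling back to the default pointer-size guess.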
1909
1911 const MemSDNode *MemNode = cast<MemSDNode>(N);
1912 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1913}
1914
1916 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1918}
1919
1921 unsigned DestAS) const {
1922 // Flat -> private/local is a simple truncate.
1923 // Flat -> global is no-op
1924 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1925 return true;
1926
1927 const GCNTargetMachine &TM =
1928 static_cast<const GCNTargetMachine &>(getTargetMachine());
1929 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1930}
1931
1934 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1935 VT.getScalarType().bitsLE(MVT::i16))
1938}
1939
1941 Type *Ty) const {
1942 // FIXME: Could be smarter if called for vector constants.
1943 return true;
1944}
1945
1947 unsigned Index) const {
1949 return false;
1950
1951 // TODO: Add more cases that are cheap.
1952 return Index == 0;
1953}
1954
1956 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1957 switch (Op) {
1958 case ISD::LOAD:
1959 case ISD::STORE:
1960 return true;
1961 default:
1962 return false;
1963 }
1964 }
1965
1966 // SimplifySetCC uses this function to determine whether or not it should
1967 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1968 if (VT == MVT::i1 && Op == ISD::SETCC)
1969 return false;
1970
1972}
1973
1974SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1975 const SDLoc &SL,
1976 SDValue Chain,
1977 uint64_t Offset) const {
1978 const DataLayout &DL = DAG.getDataLayout();
1982
1983 auto [InputPtrReg, RC, ArgTy] =
1985
1986 // We may not have the kernarg segment argument if we have no kernel
1987 // arguments.
1988 if (!InputPtrReg)
1989 return DAG.getConstant(Offset, SL, PtrVT);
1990
1992 SDValue BasePtr = DAG.getCopyFromReg(
1993 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1994
1995 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1996}
1997
1998SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1999 const SDLoc &SL) const {
2002 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2003}
2004
2005SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2006 const SDLoc &SL) const {
2007
2009 std::optional<uint32_t> KnownSize =
2011 if (KnownSize.has_value())
2012 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2013 return SDValue();
2014}
2015
2016SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2017 const SDLoc &SL, SDValue Val,
2018 bool Signed,
2019 const ISD::InputArg *Arg) const {
2020 // First, if it is a widened vector, narrow it.
2021 if (VT.isVector() &&
2023 EVT NarrowedVT =
2026 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2027 DAG.getConstant(0, SL, MVT::i32));
2028 }
2029
2030 // Then convert the vector elements or scalar value.
2031 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2032 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2033 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2034 }
2035
2036 if (MemVT.isFloatingPoint())
2037 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2038 else if (Signed)
2039 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2040 else
2041 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2042
2043 return Val;
2044}
2045
2046SDValue SITargetLowering::lowerKernargMemParameter(
2047 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2048 uint64_t Offset, Align Alignment, bool Signed,
2049 const ISD::InputArg *Arg) const {
2051
2052 // Try to avoid using an extload by loading earlier than the argument address,
2053 // and extracting the relevant bits. The load should hopefully be merged with
2054 // the previous argument.
2055 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2056 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2057 int64_t AlignDownOffset = alignDown(Offset, 4);
2058 int64_t OffsetDiff = Offset - AlignDownOffset;
2059
2060 EVT IntVT = MemVT.changeTypeToInteger();
2061
2062 // TODO: If we passed in the base kernel offset we could have a better
2063 // alignment than 4, but we don't really need it.
2064 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2065 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2068
2069 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2070 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2071
2072 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2073 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2074 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2075
2076 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2077 }
2078
2079 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2080 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2083
2084 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2085 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2086}
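// Illustrative sketch, not part of this file: the align-down trick above for a
// hypothetical i8 kernel argument at byte offset 9. The dword at offset 8 is
// loaded instead, and the byte is recovered with a shift and a truncate.
static unsigned char extractSubDwordKernArg(unsigned DwordAtOffset8,
                                            unsigned long long Offset = 9) {
  unsigned long long AlignDownOffset = Offset & ~3ull;      // alignDown(9, 4) == 8
  unsigned long long OffsetDiff = Offset - AlignDownOffset; // 1 byte into the dword
  // SRL by OffsetDiff * 8 == 8 bits, then truncate to the i8 argument value.
  return static_cast<unsigned char>(DwordAtOffset8 >> (OffsetDiff * 8));
}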
2087
2088SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2089 CCValAssign &VA, const SDLoc &SL,
2090 SDValue Chain,
2091 const ISD::InputArg &Arg) const {
2093 MachineFrameInfo &MFI = MF.getFrameInfo();
2094
2095 if (Arg.Flags.isByVal()) {
2096 unsigned Size = Arg.Flags.getByValSize();
2097 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2098 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2099 }
2100
2101 unsigned ArgOffset = VA.getLocMemOffset();
2102 unsigned ArgSize = VA.getValVT().getStoreSize();
2103
2104 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2105
2106 // Create load nodes to retrieve arguments from the stack.
2107 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2108 SDValue ArgValue;
2109
2110 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2112 MVT MemVT = VA.getValVT();
2113
2114 switch (VA.getLocInfo()) {
2115 default:
2116 break;
2117 case CCValAssign::BCvt:
2118 MemVT = VA.getLocVT();
2119 break;
2120 case CCValAssign::SExt:
2121 ExtType = ISD::SEXTLOAD;
2122 break;
2123 case CCValAssign::ZExt:
2124 ExtType = ISD::ZEXTLOAD;
2125 break;
2126 case CCValAssign::AExt:
2127 ExtType = ISD::EXTLOAD;
2128 break;
2129 }
2130
2131 ArgValue = DAG.getExtLoad(
2132 ExtType, SL, VA.getLocVT(), Chain, FIN,
2134 return ArgValue;
2135}
2136
2137SDValue SITargetLowering::getPreloadedValue(
2138 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2140 const ArgDescriptor *Reg = nullptr;
2141 const TargetRegisterClass *RC;
2142 LLT Ty;
2143
2145 const ArgDescriptor WorkGroupIDX =
2146 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2147 // If GridZ is not programmed in an entry function then the hardware will set
2148 // it to all zeros, so there is no need to mask the GridY value in the low
2149 // order bits.
2150 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2151 AMDGPU::TTMP7,
2152 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2153 const ArgDescriptor WorkGroupIDZ =
2154 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2155 if (Subtarget->hasArchitectedSGPRs() &&
2157 switch (PVID) {
2159 Reg = &WorkGroupIDX;
2160 RC = &AMDGPU::SReg_32RegClass;
2161 Ty = LLT::scalar(32);
2162 break;
2164 Reg = &WorkGroupIDY;
2165 RC = &AMDGPU::SReg_32RegClass;
2166 Ty = LLT::scalar(32);
2167 break;
2169 Reg = &WorkGroupIDZ;
2170 RC = &AMDGPU::SReg_32RegClass;
2171 Ty = LLT::scalar(32);
2172 break;
2173 default:
2174 break;
2175 }
2176 }
2177
2178 if (!Reg)
2179 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2180 if (!Reg) {
2182 // It's possible for a kernarg intrinsic call to appear in a kernel with
2183 // no allocated segment, in which case we do not add the user sgpr
2184 // argument, so just return null.
2185 return DAG.getConstant(0, SDLoc(), VT);
2186 }
2187
2188 // It's undefined behavior if a function marked with the amdgpu-no-*
2189 // attributes uses the corresponding intrinsic.
2190 return DAG.getUNDEF(VT);
2191 }
2192
2193 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2194}
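// Illustrative sketch, not part of this file: how the masks above map the
// architected SGPRs onto workgroup IDs. TTMP9 carries the X id whole, while
// TTMP7 packs Y in its low 16 bits and Z in its high 16 bits (Y is taken
// unmasked when the Z id is known to be unused).
struct PackedWorkGroupIDs {
  unsigned X, Y, Z;
};
static PackedWorkGroupIDs unpackWorkGroupIDs(unsigned TTMP9, unsigned TTMP7) {
  return {TTMP9,           // WorkGroupIDX: full register
          TTMP7 & 0xFFFFu, // WorkGroupIDY: mask 0x0000FFFF
          TTMP7 >> 16};    // WorkGroupIDZ: mask 0xFFFF0000, shifted down
}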
2195
2197 CallingConv::ID CallConv,
2198 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2199 FunctionType *FType,
2200 SIMachineFunctionInfo *Info) {
2201 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2202 const ISD::InputArg *Arg = &Ins[I];
2203
2204 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2205 "vector type argument should have been split");
2206
2207 // First check if it's a PS input addr.
2208 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2209 PSInputNum <= 15) {
2210 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2211
2212 // Inconveniently only the first part of the split is marked as isSplit,
2213 // so skip to the end. We only want to increment PSInputNum once for the
2214 // entire split argument.
2215 if (Arg->Flags.isSplit()) {
2216 while (!Arg->Flags.isSplitEnd()) {
2217 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2218 "unexpected vector split in ps argument type");
2219 if (!SkipArg)
2220 Splits.push_back(*Arg);
2221 Arg = &Ins[++I];
2222 }
2223 }
2224
2225 if (SkipArg) {
2226 // We can safely skip PS inputs.
2227 Skipped.set(Arg->getOrigArgIndex());
2228 ++PSInputNum;
2229 continue;
2230 }
2231
2232 Info->markPSInputAllocated(PSInputNum);
2233 if (Arg->Used)
2234 Info->markPSInputEnabled(PSInputNum);
2235
2236 ++PSInputNum;
2237 }
2238
2239 Splits.push_back(*Arg);
2240 }
2241}
2242
2243// Allocate special inputs passed in VGPRs.
2245 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2246 SIMachineFunctionInfo &Info) const {
2247 const LLT S32 = LLT::scalar(32);
2249
2250 if (Info.hasWorkItemIDX()) {
2251 Register Reg = AMDGPU::VGPR0;
2252 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2253
2254 CCInfo.AllocateReg(Reg);
2255 unsigned Mask =
2256 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2257 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2258 }
2259
2260 if (Info.hasWorkItemIDY()) {
2261 assert(Info.hasWorkItemIDX());
2262 if (Subtarget->hasPackedTID()) {
2263 Info.setWorkItemIDY(
2264 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2265 } else {
2266 unsigned Reg = AMDGPU::VGPR1;
2267 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268
2269 CCInfo.AllocateReg(Reg);
2270 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2271 }
2272 }
2273
2274 if (Info.hasWorkItemIDZ()) {
2275 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2276 if (Subtarget->hasPackedTID()) {
2277 Info.setWorkItemIDZ(
2278 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2279 } else {
2280 unsigned Reg = AMDGPU::VGPR2;
2281 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2282
2283 CCInfo.AllocateReg(Reg);
2284 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2285 }
2286 }
2287}
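// Illustrative sketch, not part of this file: with packed TIDs, the three
// workitem ids above share VGPR0, using the same bit layout the masks encode.
static unsigned workItemIDFromPackedTID(unsigned VGPR0, unsigned Dim /*0..2*/) {
  return (VGPR0 >> (10 * Dim)) & 0x3ffu; // X: bits [9:0], Y: [19:10], Z: [29:20]
}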
2288
2289// Try to allocate a VGPR at the end of the argument list, or if no argument
2290// VGPRs are left, allocate a stack slot instead.
2291// If \p Mask is given it indicates the bitfield position in the register.
2292// If \p Arg is given, use it with the new \p Mask instead of allocating anew.
2293static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2294 ArgDescriptor Arg = ArgDescriptor()) {
2295 if (Arg.isSet())
2296 return ArgDescriptor::createArg(Arg, Mask);
2297
2298 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2299 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2300 if (RegIdx == ArgVGPRs.size()) {
2301 // Spill to stack required.
2302 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2303
2304 return ArgDescriptor::createStack(Offset, Mask);
2305 }
2306
2307 unsigned Reg = ArgVGPRs[RegIdx];
2308 Reg = CCInfo.AllocateReg(Reg);
2309 assert(Reg != AMDGPU::NoRegister);
2310
2311 MachineFunction &MF = CCInfo.getMachineFunction();
2312 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2313 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2314 return ArgDescriptor::createRegister(Reg, Mask);
2315}
2316
2318 const TargetRegisterClass *RC,
2319 unsigned NumArgRegs) {
2320 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2321 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2322 if (RegIdx == ArgSGPRs.size())
2323 report_fatal_error("ran out of SGPRs for arguments");
2324
2325 unsigned Reg = ArgSGPRs[RegIdx];
2326 Reg = CCInfo.AllocateReg(Reg);
2327 assert(Reg != AMDGPU::NoRegister);
2328
2329 MachineFunction &MF = CCInfo.getMachineFunction();
2330 MF.addLiveIn(Reg, RC);
2332}
2333
2334// If this has a fixed position, we still should allocate the register in the
2335// CCInfo state. Technically we could get away with this for values passed
2336// outside of the normal argument range.
2338 const TargetRegisterClass *RC,
2339 MCRegister Reg) {
2340 Reg = CCInfo.AllocateReg(Reg);
2341 assert(Reg != AMDGPU::NoRegister);
2342 MachineFunction &MF = CCInfo.getMachineFunction();
2343 MF.addLiveIn(Reg, RC);
2344}
2345
2346static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2347 if (Arg) {
2348 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2349 Arg.getRegister());
2350 } else
2351 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2352}
2353
2354static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2355 if (Arg) {
2356 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2357 Arg.getRegister());
2358 } else
2359 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2360}
2361
2362/// Allocate implicit function VGPR arguments at the end of allocated user
2363/// arguments.
2365 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2366 SIMachineFunctionInfo &Info) const {
2367 const unsigned Mask = 0x3ff;
2368 ArgDescriptor Arg;
2369
2370 if (Info.hasWorkItemIDX()) {
2371 Arg = allocateVGPR32Input(CCInfo, Mask);
2372 Info.setWorkItemIDX(Arg);
2373 }
2374
2375 if (Info.hasWorkItemIDY()) {
2376 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2377 Info.setWorkItemIDY(Arg);
2378 }
2379
2380 if (Info.hasWorkItemIDZ())
2381 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2382}
2383
2384/// Allocate implicit function VGPR arguments in fixed registers.
2386 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2387 SIMachineFunctionInfo &Info) const {
2388 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2389 if (!Reg)
2390 report_fatal_error("failed to allocated VGPR for implicit arguments");
2391
2392 const unsigned Mask = 0x3ff;
2393 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2394 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2395 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2396}
2397
2399 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2400 SIMachineFunctionInfo &Info) const {
2401 auto &ArgInfo = Info.getArgInfo();
2402 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2403
2404 // TODO: Unify handling with private memory pointers.
2405 if (UserSGPRInfo.hasDispatchPtr())
2406 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2407
2408 if (UserSGPRInfo.hasQueuePtr())
2409 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2410
2411 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2412 // constant offset from the kernarg segment.
2413 if (Info.hasImplicitArgPtr())
2414 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2415
2416 if (UserSGPRInfo.hasDispatchID())
2417 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2418
2419 // flat_scratch_init is not applicable for non-kernel functions.
2420
2421 if (Info.hasWorkGroupIDX())
2422 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2423
2424 if (Info.hasWorkGroupIDY())
2425 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2426
2427 if (Info.hasWorkGroupIDZ())
2428 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2429
2430 if (Info.hasLDSKernelId())
2431 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2432}
2433
2434// Allocate special inputs passed in user SGPRs.
2436 MachineFunction &MF,
2437 const SIRegisterInfo &TRI,
2438 SIMachineFunctionInfo &Info) const {
2439 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2440 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2441 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2442 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2443 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2444 }
2445
2446 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2447 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2448 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2449 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2450 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2451 }
2452
2453 if (UserSGPRInfo.hasDispatchPtr()) {
2454 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2455 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2456 CCInfo.AllocateReg(DispatchPtrReg);
2457 }
2458
2459 if (UserSGPRInfo.hasQueuePtr()) {
2460 Register QueuePtrReg = Info.addQueuePtr(TRI);
2461 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2462 CCInfo.AllocateReg(QueuePtrReg);
2463 }
2464
2465 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2467 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2468 CCInfo.AllocateReg(InputPtrReg);
2469
2470 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2471 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2472 }
2473
2474 if (UserSGPRInfo.hasDispatchID()) {
2475 Register DispatchIDReg = Info.addDispatchID(TRI);
2476 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2477 CCInfo.AllocateReg(DispatchIDReg);
2478 }
2479
2480 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2481 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2482 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2483 CCInfo.AllocateReg(FlatScratchInitReg);
2484 }
2485
2486 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2487 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2488 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2489 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2490 }
2491
2492 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2493 // these from the dispatch pointer.
2494}
2495
2496// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2497// sequential, starting from the first argument.
2499 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2501 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2502 Function &F = MF.getFunction();
2503 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2504 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2505 bool InPreloadSequence = true;
2506 unsigned InIdx = 0;
2507 bool AlignedForImplictArgs = false;
2508 unsigned ImplicitArgOffset = 0;
2509 for (auto &Arg : F.args()) {
2510 if (!InPreloadSequence || !Arg.hasInRegAttr())
2511 break;
2512
2513 unsigned ArgIdx = Arg.getArgNo();
2514 // Don't preload non-original args or parts not in the current preload
2515 // sequence.
2516 if (InIdx < Ins.size() &&
2517 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2518 break;
2519
2520 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2521 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2522 InIdx++) {
2523 assert(ArgLocs[ArgIdx].isMemLoc());
2524 auto &ArgLoc = ArgLocs[InIdx];
2525 const Align KernelArgBaseAlign = Align(16);
2526 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2527 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2528 unsigned NumAllocSGPRs =
2529 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2530
2531 // Fix alignment for hidden arguments.
2532 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2533 if (!AlignedForImplictArgs) {
2534 ImplicitArgOffset =
2535 alignTo(LastExplicitArgOffset,
2536 Subtarget->getAlignmentForImplicitArgPtr()) -
2537 LastExplicitArgOffset;
2538 AlignedForImplictArgs = true;
2539 }
2540 ArgOffset += ImplicitArgOffset;
2541 }
2542
2543 // Arg is preloaded into the previous SGPR.
2544 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2545 assert(InIdx >= 1 && "No previous SGPR");
2546 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2547 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2548 continue;
2549 }
2550
2551 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2552 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2553 // Check for free user SGPRs for preloading.
2554 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2555 InPreloadSequence = false;
2556 break;
2557 }
2558
2559 // Preload this argument.
2560 const TargetRegisterClass *RC =
2561 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2562 SmallVectorImpl<MCRegister> *PreloadRegs =
2563 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2564
2565 if (PreloadRegs->size() > 1)
2566 RC = &AMDGPU::SGPR_32RegClass;
2567 for (auto &Reg : *PreloadRegs) {
2568 assert(Reg);
2569 MF.addLiveIn(Reg, RC);
2570 CCInfo.AllocateReg(Reg);
2571 }
2572
2573 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2574 }
2575 }
2576}
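// Illustrative sketch, not part of this file: the SGPR budgeting above for a
// hypothetical i64 argument at byte offset 16 whose predecessor (an i32) ended
// at offset 12. One padding SGPR covers the 4-byte gap and two SGPRs hold the
// value, so three free user SGPRs are needed in total.
static unsigned preloadSGPRsNeeded(unsigned ArgOffset /*16*/,
                                   unsigned LastExplicitArgOffset /*12*/,
                                   unsigned SizeInBits /*64*/) {
  unsigned NumAllocSGPRs = (SizeInBits + 31) / 32;      // alignTo(64, 32) / 32 == 2
  unsigned Padding = ArgOffset - LastExplicitArgOffset; // 4 bytes of padding
  unsigned PaddingSGPRs = (Padding + 3) / 4;            // alignTo(4, 4) / 4 == 1
  return PaddingSGPRs + NumAllocSGPRs;                  // compared against free user SGPRs
}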
2577
2579 const SIRegisterInfo &TRI,
2580 SIMachineFunctionInfo &Info) const {
2581 // Always allocate this last since it is a synthetic preload.
2582 if (Info.hasLDSKernelId()) {
2583 Register Reg = Info.addLDSKernelId();
2584 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2585 CCInfo.AllocateReg(Reg);
2586 }
2587}
2588
2589// Allocate special input registers that are initialized per-wave.
2592 CallingConv::ID CallConv,
2593 bool IsShader) const {
2594 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2595 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2596 // Note: user SGPRs are handled by the front-end for graphics shaders
2597 // Pad up the used user SGPRs with dead inputs.
2598
2599 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2600 // before enabling architected SGPRs for workgroup IDs.
2601 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2602
2603 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2604 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2605 // rely on it to reach 16 since if we end up having no stack usage, it will
2606 // not really be added.
2607 unsigned NumRequiredSystemSGPRs =
2608 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2609 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2610 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2611 Register Reg = Info.addReservedUserSGPR();
2612 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2613 CCInfo.AllocateReg(Reg);
2614 }
2615 }
2616
2617 if (!HasArchitectedSGPRs) {
2618 if (Info.hasWorkGroupIDX()) {
2619 Register Reg = Info.addWorkGroupIDX();
2620 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2621 CCInfo.AllocateReg(Reg);
2622 }
2623
2624 if (Info.hasWorkGroupIDY()) {
2625 Register Reg = Info.addWorkGroupIDY();
2626 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2627 CCInfo.AllocateReg(Reg);
2628 }
2629
2630 if (Info.hasWorkGroupIDZ()) {
2631 Register Reg = Info.addWorkGroupIDZ();
2632 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2633 CCInfo.AllocateReg(Reg);
2634 }
2635 }
2636
2637 if (Info.hasWorkGroupInfo()) {
2638 Register Reg = Info.addWorkGroupInfo();
2639 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2640 CCInfo.AllocateReg(Reg);
2641 }
2642
2643 if (Info.hasPrivateSegmentWaveByteOffset()) {
2644 // Scratch wave offset passed in system SGPR.
2645 unsigned PrivateSegmentWaveByteOffsetReg;
2646
2647 if (IsShader) {
2648 PrivateSegmentWaveByteOffsetReg =
2649 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2650
2651 // This is true if the scratch wave byte offset doesn't have a fixed
2652 // location.
2653 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2654 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2655 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2656 }
2657 } else
2658 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2659
2660 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2661 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2662 }
2663
2664 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2665 Info.getNumPreloadedSGPRs() >= 16);
2666}
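// Illustrative sketch, not part of this file: the padding loop above for a
// hypothetical kernel with 6 user SGPRs that also needs workgroup IDs X and Y
// (2 system SGPRs counted here). 16 - (6 + 2) = 8 reserved filler SGPRs are
// added so the subtarget's user-SGPR-init bug is avoided.
static unsigned numReservedFillerSGPRs(unsigned CurrentUserSGPRs /*6*/,
                                       unsigned NumRequiredSystemSGPRs /*2*/) {
  unsigned Used = CurrentUserSGPRs + NumRequiredSystemSGPRs;
  return Used < 16 ? 16 - Used : 0;
}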
2667
2669 MachineFunction &MF,
2670 const SIRegisterInfo &TRI,
2671 SIMachineFunctionInfo &Info) {
2672 // Now that we've figured out where the scratch register inputs are, see if
2673 // we should reserve the arguments and use them directly.
2674 MachineFrameInfo &MFI = MF.getFrameInfo();
2675 bool HasStackObjects = MFI.hasStackObjects();
2676 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2677
2678 // Record that we know we have non-spill stack objects so we don't need to
2679 // check all stack objects later.
2680 if (HasStackObjects)
2681 Info.setHasNonSpillStackObjects(true);
2682
2683 // Everything live out of a block is spilled with fast regalloc, so it's
2684 // almost certain that spilling will be required.
2685 if (TM.getOptLevel() == CodeGenOptLevel::None)
2686 HasStackObjects = true;
2687
2688 // For now assume stack access is needed in any callee functions, so we need
2689 // the scratch registers to pass in.
2690 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2691
2692 if (!ST.enableFlatScratch()) {
2693 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2694 // If we have stack objects, we unquestionably need the private buffer
2695 // resource. For the Code Object V2 ABI, this will be the first 4 user
2696 // SGPR inputs. We can reserve those and use them directly.
2697
2698 Register PrivateSegmentBufferReg =
2700 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2701 } else {
2702 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2703 // We tentatively reserve the last registers (skipping those that may
2704 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2705 // we'll replace these with the ones immediately after those which were
2706 // really allocated. In the prologue copies will be inserted from the
2707 // argument to these reserved registers.
2708
2709 // Without HSA, relocations are used for the scratch pointer and the
2710 // buffer resource setup is always inserted in the prologue. Scratch wave
2711 // offset is still in an input SGPR.
2712 Info.setScratchRSrcReg(ReservedBufferReg);
2713 }
2714 }
2715
2717
2718 // For entry functions we have to set up the stack pointer if we use it,
2719 // whereas non-entry functions get this "for free". This means there is no
2720 // intrinsic advantage to using S32 over S34 in cases where we do not have
2721 // calls but do need a frame pointer (i.e. if we are requested to have one
2722 // because frame pointer elimination is disabled). To keep things simple we
2723 // only ever use S32 as the call ABI stack pointer, and so using it does not
2724 // imply we need a separate frame pointer.
2725 //
2726 // Try to use s32 as the SP, but move it if it would interfere with input
2727 // arguments. This won't work with calls though.
2728 //
2729 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2730 // registers.
2731 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2732 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2733 } else {
2735
2736 if (MFI.hasCalls())
2737 report_fatal_error("call in graphics shader with too many input SGPRs");
2738
2739 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2740 if (!MRI.isLiveIn(Reg)) {
2741 Info.setStackPtrOffsetReg(Reg);
2742 break;
2743 }
2744 }
2745
2746 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2747 report_fatal_error("failed to find register for SP");
2748 }
2749
2750 // hasFP should be accurate for entry functions even before the frame is
2751 // finalized, because it does not rely on the known stack size, only
2752 // properties like whether variable sized objects are present.
2753 if (ST.getFrameLowering()->hasFP(MF)) {
2754 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2755 }
2756}
2757
2760 return !Info->isEntryFunction();
2761}
2762
2764
2766 MachineBasicBlock *Entry,
2767 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2769
2770 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2771 if (!IStart)
2772 return;
2773
2774 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2775 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2776 MachineBasicBlock::iterator MBBI = Entry->begin();
2777 for (const MCPhysReg *I = IStart; *I; ++I) {
2778 const TargetRegisterClass *RC = nullptr;
2779 if (AMDGPU::SReg_64RegClass.contains(*I))
2780 RC = &AMDGPU::SGPR_64RegClass;
2781 else if (AMDGPU::SReg_32RegClass.contains(*I))
2782 RC = &AMDGPU::SGPR_32RegClass;
2783 else
2784 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2785
2786 Register NewVR = MRI->createVirtualRegister(RC);
2787 // Create copy from CSR to a virtual register.
2788 Entry->addLiveIn(*I);
2789 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2790 .addReg(*I);
2791
2792 // Insert the copy-back instructions right before the terminator.
2793 for (auto *Exit : Exits)
2794 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2795 TII->get(TargetOpcode::COPY), *I)
2796 .addReg(NewVR);
2797 }
2798}
2799
2801 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2802 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2803 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2805
2807 const Function &Fn = MF.getFunction();
2810
2811 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2812 DiagnosticInfoUnsupported NoGraphicsHSA(
2813 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2814 DAG.getContext()->diagnose(NoGraphicsHSA);
2815 return DAG.getEntryNode();
2816 }
2817
2820 BitVector Skipped(Ins.size());
2821 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2822 *DAG.getContext());
2823
2824 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2825 bool IsKernel = AMDGPU::isKernel(CallConv);
2826 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2827
2828 if (IsGraphics) {
2829 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2830 assert(!UserSGPRInfo.hasDispatchPtr() &&
2831 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2832 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2833 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2834 (void)UserSGPRInfo;
2835 if (!Subtarget->enableFlatScratch())
2836 assert(!UserSGPRInfo.hasFlatScratchInit());
2837 if ((CallConv != CallingConv::AMDGPU_CS &&
2838 CallConv != CallingConv::AMDGPU_Gfx) ||
2839 !Subtarget->hasArchitectedSGPRs())
2840 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2841 !Info->hasWorkGroupIDZ());
2842 }
2843
2844 if (CallConv == CallingConv::AMDGPU_PS) {
2845 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2846
2847 // At least one interpolation mode must be enabled or else the GPU will
2848 // hang.
2849 //
2850 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2851 // set PSInputAddr, the user wants to enable some bits after the compilation
2852 // based on run-time states. Since we can't know what the final PSInputEna
2853 // will look like, we shouldn't do anything here and the user should take
2854 // responsibility for the correct programming.
2855 //
2856 // Otherwise, the following restrictions apply:
2857 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2858 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2859 // enabled too.
2860 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2861 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2862 CCInfo.AllocateReg(AMDGPU::VGPR0);
2863 CCInfo.AllocateReg(AMDGPU::VGPR1);
2864 Info->markPSInputAllocated(0);
2865 Info->markPSInputEnabled(0);
2866 }
2867 if (Subtarget->isAmdPalOS()) {
2868 // For isAmdPalOS, the user does not enable some bits after compilation
2869 // based on run-time states; the register values being generated here are
2870 // the final ones set in hardware. Therefore we need to apply the
2871 // workaround to PSInputAddr and PSInputEnable together. (The case where
2872 // a bit is set in PSInputAddr but not PSInputEnable is where the
2873 // frontend set up an input arg for a particular interpolation mode, but
2874 // nothing uses that input arg. Really we should have an earlier pass
2875 // that removes such an arg.)
2876 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2877 if ((PsInputBits & 0x7F) == 0 ||
2878 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2879 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2880 }
2881 } else if (IsKernel) {
2882 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2883 } else {
2884 Splits.append(Ins.begin(), Ins.end());
2885 }
2886
2887 if (IsKernel)
2888 analyzeFormalArgumentsCompute(CCInfo, Ins);
2889
2890 if (IsEntryFunc) {
2891 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2892 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2893 if (IsKernel && Subtarget->hasKernargPreload())
2894 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2895
2896 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2897 } else if (!IsGraphics) {
2898 // For the fixed ABI, pass workitem IDs in the last argument register.
2899 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2900
2901 // FIXME: Sink this into allocateSpecialInputSGPRs
2902 if (!Subtarget->enableFlatScratch())
2903 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2904
2905 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2906 }
2907
2908 if (!IsKernel) {
2909 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2910 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2911 }
2912
2914
2915 // FIXME: This is the minimum kernel argument alignment. We should improve
2916 // this to the maximum alignment of the arguments.
2917 //
2918 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2919 // kern arg offset.
2920 const Align KernelArgBaseAlign = Align(16);
2921
2922 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2923 const ISD::InputArg &Arg = Ins[i];
2924 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2925 InVals.push_back(DAG.getUNDEF(Arg.VT));
2926 continue;
2927 }
2928
2929 CCValAssign &VA = ArgLocs[ArgIdx++];
2930 MVT VT = VA.getLocVT();
2931
2932 if (IsEntryFunc && VA.isMemLoc()) {
2933 VT = Ins[i].VT;
2934 EVT MemVT = VA.getLocVT();
2935
2936 const uint64_t Offset = VA.getLocMemOffset();
2937 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2938
2939 if (Arg.Flags.isByRef()) {
2940 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2941
2942 const GCNTargetMachine &TM =
2943 static_cast<const GCNTargetMachine &>(getTargetMachine());
2944 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2945 Arg.Flags.getPointerAddrSpace())) {
2948 }
2949
2950 InVals.push_back(Ptr);
2951 continue;
2952 }
2953
2954 SDValue NewArg;
2955 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2956 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2957 // In this case the argument is packed into the previous preload SGPR.
2958 int64_t AlignDownOffset = alignDown(Offset, 4);
2959 int64_t OffsetDiff = Offset - AlignDownOffset;
2960 EVT IntVT = MemVT.changeTypeToInteger();
2961
2965 Register Reg =
2966 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2967
2968 assert(Reg);
2969 Register VReg = MRI.getLiveInVirtReg(Reg);
2970 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2971
2972 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2973 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2974
2975 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2976 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2977 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2978 Ins[i].Flags.isSExt(), &Ins[i]);
2979
2980 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2981 } else {
2985 const SmallVectorImpl<MCRegister> &PreloadRegs =
2986 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2987
2988 SDValue Copy;
2989 if (PreloadRegs.size() == 1) {
2990 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2991 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2992 NewArg = DAG.getCopyFromReg(
2993 Chain, DL, VReg,
2995 TRI->getRegSizeInBits(*RC)));
2996
2997 } else {
2998 // If the kernarg alignment does not match the alignment of the SGPR
2999 // tuple RC that can accommodate this argument, it will be built up
3000 // via copies from the individual SGPRs that the argument was
3001 // preloaded to.
3003 for (auto Reg : PreloadRegs) {
3004 Register VReg = MRI.getLiveInVirtReg(Reg);
3005 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3006 Elts.push_back(Copy);
3007 }
3008 NewArg =
3009 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3010 PreloadRegs.size()),
3011 DL, Elts);
3012 }
3013
3014 // If the argument was preloaded to multiple consecutive 32-bit
3015 // registers because of misalignment between addressable SGPR tuples
3016 // and the argument size, we can still assume, because of kernarg
3017 // segment alignment restrictions, that NewArg's size is the same as
3018 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3019 // truncate since we cannot preload to less than a single SGPR and the
3020 // MemVT may be smaller.
3021 EVT MemVTInt =
3023 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3024 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3025
3026 NewArg = DAG.getBitcast(MemVT, NewArg);
3027 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3028 Ins[i].Flags.isSExt(), &Ins[i]);
3029 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3030 }
3031 } else {
3032 // Hidden arguments that are in the kernel signature must be preloaded
3033 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3034 // the argument list and is not preloaded.
3035 if (Arg.isOrigArg()) {
3036 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3037 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3038 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3039 *OrigArg->getParent(),
3040 "hidden argument in kernel signature was not preloaded",
3041 DL.getDebugLoc());
3042 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3043 }
3044 }
3045
3046 NewArg =
3047 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3048 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3049 }
3050 Chains.push_back(NewArg.getValue(1));
3051
3052 auto *ParamTy =
3053 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3055 ParamTy &&
3056 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3057 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3058 // On SI local pointers are just offsets into LDS, so they are always
3059 // less than 16-bits. On CI and newer they could potentially be
3060 // real pointers, so we can't guarantee their size.
3061 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3062 DAG.getValueType(MVT::i16));
3063 }
3064
3065 InVals.push_back(NewArg);
3066 continue;
3067 }
3068 if (!IsEntryFunc && VA.isMemLoc()) {
3069 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3070 InVals.push_back(Val);
3071 if (!Arg.Flags.isByVal())
3072 Chains.push_back(Val.getValue(1));
3073 continue;
3074 }
3075
3076 assert(VA.isRegLoc() && "Parameter must be in a register!");
3077
3078 Register Reg = VA.getLocReg();
3079 const TargetRegisterClass *RC = nullptr;
3080 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3081 RC = &AMDGPU::VGPR_32RegClass;
3082 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3083 RC = &AMDGPU::SGPR_32RegClass;
3084 else
3085 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3086 EVT ValVT = VA.getValVT();
3087
3088 Reg = MF.addLiveIn(Reg, RC);
3089 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3090
3091 if (Arg.Flags.isSRet()) {
3092 // The return object should be reasonably addressable.
3093
3094 // FIXME: This helps when the return is a real sret. If it is an
3095 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3096 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3097 unsigned NumBits =
3099 Val = DAG.getNode(
3100 ISD::AssertZext, DL, VT, Val,
3101 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3102 }
3103
3104 // If this is an 8 or 16-bit value, it is really passed promoted
3105 // to 32 bits. Insert an assert[sz]ext to capture this, then
3106 // truncate to the right size.
3107 switch (VA.getLocInfo()) {
3108 case CCValAssign::Full:
3109 break;
3110 case CCValAssign::BCvt:
3111 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3112 break;
3113 case CCValAssign::SExt:
3114 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3115 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3116 break;
3117 case CCValAssign::ZExt:
3118 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3119 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3120 break;
3121 case CCValAssign::AExt:
3122 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3123 break;
3124 default:
3125 llvm_unreachable("Unknown loc info!");
3126 }
3127
3128 InVals.push_back(Val);
3129 }
3130
3131 // Start adding system SGPRs.
3132 if (IsEntryFunc)
3133 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3134
3135 // DAG.getPass() returns nullptr when using new pass manager.
3136 // TODO: Use DAG.getMFAM() to access analysis result.
3137 if (DAG.getPass()) {
3138 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3139 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3140 }
3141
3142 unsigned StackArgSize = CCInfo.getStackSize();
3143 Info->setBytesInStackArgArea(StackArgSize);
3144
3145 return Chains.empty() ? Chain
3146 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3147}
3148
3149// TODO: If return values can't fit in registers, we should return as many as
3150// possible in registers before passing the rest on the stack.
3152 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3153 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3154 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3155 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3156 // for shaders. Vector types should be explicitly handled by CC.
3157 if (AMDGPU::isEntryFunctionCC(CallConv))
3158 return true;
3159
3161 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3162 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3163 return false;
3164
3165 // We must use the stack if return would require unavailable registers.
3166 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3167 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3168 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3169 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3170 return false;
3171
3172 return true;
3173}
3174
3175SDValue
3177 bool isVarArg,
3179 const SmallVectorImpl<SDValue> &OutVals,
3180 const SDLoc &DL, SelectionDAG &DAG) const {
3183
3184 if (AMDGPU::isKernel(CallConv)) {
3185 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3186 OutVals, DL, DAG);
3187 }
3188
3189 bool IsShader = AMDGPU::isShader(CallConv);
3190
3191 Info->setIfReturnsVoid(Outs.empty());
3192 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3193
3194 // CCValAssign - represents the assignment of the return value to a location.
3197
3198 // CCState - Info about the registers and stack slots.
3199 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3200 *DAG.getContext());
3201
3202 // Analyze outgoing return values.
3203 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3204
3205 SDValue Glue;
3207 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3208
3209 // Copy the result values into the output registers.
3210 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3211 ++I, ++RealRVLocIdx) {
3212 CCValAssign &VA = RVLocs[I];
3213 assert(VA.isRegLoc() && "Can only return in registers!");
3214 // TODO: Partially return in registers if return values don't fit.
3215 SDValue Arg = OutVals[RealRVLocIdx];
3216
3217 // Copied from other backends.
3218 switch (VA.getLocInfo()) {
3219 case CCValAssign::Full:
3220 break;
3221 case CCValAssign::BCvt:
3222 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3223 break;
3224 case CCValAssign::SExt:
3225 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3226 break;
3227 case CCValAssign::ZExt:
3228 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3229 break;
3230 case CCValAssign::AExt:
3231 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3232 break;
3233 default:
3234 llvm_unreachable("Unknown loc info!");
3235 }
3236
3237 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3238 Glue = Chain.getValue(1);
3239 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3240 }
3241
3242 // FIXME: Does sret work properly?
3243 if (!Info->isEntryFunction()) {
3244 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3245 const MCPhysReg *I =
3246 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3247 if (I) {
3248 for (; *I; ++I) {
3249 if (AMDGPU::SReg_64RegClass.contains(*I))
3250 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3251 else if (AMDGPU::SReg_32RegClass.contains(*I))
3252 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3253 else
3254 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3255 }
3256 }
3257 }
3258
3259 // Update chain and glue.
3260 RetOps[0] = Chain;
3261 if (Glue.getNode())
3262 RetOps.push_back(Glue);
3263
3264 unsigned Opc = AMDGPUISD::ENDPGM;
3265 if (!IsWaveEnd)
3267 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3268}
3269
3271 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3272 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3273 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3274 SDValue ThisVal) const {
3275 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3276
3277 // Assign locations to each value returned by this call.
3279 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3280 *DAG.getContext());
3281 CCInfo.AnalyzeCallResult(Ins, RetCC);
3282
3283 // Copy all of the result registers out of their specified physreg.
3284 for (CCValAssign VA : RVLocs) {
3285 SDValue Val;
3286
3287 if (VA.isRegLoc()) {
3288 Val =
3289 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3290 Chain = Val.getValue(1);
3291 InGlue = Val.getValue(2);
3292 } else if (VA.isMemLoc()) {
3293 report_fatal_error("TODO: return values in memory");
3294 } else
3295 llvm_unreachable("unknown argument location type");
3296
3297 switch (VA.getLocInfo()) {
3298 case CCValAssign::Full:
3299 break;
3300 case CCValAssign::BCvt:
3301 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3302 break;
3303 case CCValAssign::ZExt:
3304 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3305 DAG.getValueType(VA.getValVT()));
3306 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3307 break;
3308 case CCValAssign::SExt:
3309 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3310 DAG.getValueType(VA.getValVT()));
3311 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3312 break;
3313 case CCValAssign::AExt:
3314 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3315 break;
3316 default:
3317 llvm_unreachable("Unknown loc info!");
3318 }
3319
3320 InVals.push_back(Val);
3321 }
3322
3323 return Chain;
3324}
3325
3326// Add code to pass the special inputs required by the features in use,
3327// separate from the explicit user arguments present in the IR.
3329 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3330 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3331 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3332 // If we don't have a call site, this was a call inserted by
3333 // legalization. These can never use special inputs.
3334 if (!CLI.CB)
3335 return;
3336
3337 SelectionDAG &DAG = CLI.DAG;
3338 const SDLoc &DL = CLI.DL;
3339 const Function &F = DAG.getMachineFunction().getFunction();
3340
3341 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3342 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3343
3344 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3346 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3347 // DAG.getPass() returns nullptr when using new pass manager.
3348 // TODO: Use DAG.getMFAM() to access analysis result.
3349 if (DAG.getPass()) {
3350 auto &ArgUsageInfo =
3352 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3353 }
3354 }
3355
3356 // TODO: Unify with private memory register handling. This is complicated by
3357 // the fact that at least in kernels, the input argument is not necessarily
3358 // in the same location as the input.
3359 // clang-format off
3360 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3362 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3363 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3364 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3365 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3366 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3367 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3368 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3369 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3370 };
3371 // clang-format on
3372
3373 for (auto [InputID, Attr] : ImplicitAttrs) {
3374 // If the callee does not use the attribute value, skip copying the value.
3375 if (CLI.CB->hasFnAttr(Attr))
3376 continue;
3377
3378 const auto [OutgoingArg, ArgRC, ArgTy] =
3379 CalleeArgInfo->getPreloadedValue(InputID);
3380 if (!OutgoingArg)
3381 continue;
3382
3383 const auto [IncomingArg, IncomingArgRC, Ty] =
3384 CallerArgInfo.getPreloadedValue(InputID);
3385 assert(IncomingArgRC == ArgRC);
3386
3387 // All special arguments are ints for now.
3388 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3389 SDValue InputReg;
3390
3391 if (IncomingArg) {
3392 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3393 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3394 // The implicit arg ptr is special because it doesn't have a corresponding
3395 // input for kernels, and is computed from the kernarg segment pointer.
3396 InputReg = getImplicitArgPtr(DAG, DL);
3397 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3398 std::optional<uint32_t> Id =
3400 if (Id.has_value()) {
3401 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3402 } else {
3403 InputReg = DAG.getUNDEF(ArgVT);
3404 }
3405 } else {
3406 // We may have proven the input wasn't needed, although the ABI is
3407 // requiring it. We just need to allocate the register appropriately.
3408 InputReg = DAG.getUNDEF(ArgVT);
3409 }
3410
3411 if (OutgoingArg->isRegister()) {
3412 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3413 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3414 report_fatal_error("failed to allocate implicit input argument");
3415 } else {
3416 unsigned SpecialArgOffset =
3417 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3418 SDValue ArgStore =
3419 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3420 MemOpChains.push_back(ArgStore);
3421 }
3422 }
3423
3424 // Pack workitem IDs into a single register, or pass them as-is if already
3425 // packed.
3426
3427 auto [OutgoingArg, ArgRC, Ty] =
3429 if (!OutgoingArg)
3430 std::tie(OutgoingArg, ArgRC, Ty) =
3432 if (!OutgoingArg)
3433 std::tie(OutgoingArg, ArgRC, Ty) =
3435 if (!OutgoingArg)
3436 return;
3437
3438 const ArgDescriptor *IncomingArgX = std::get<0>(
3440 const ArgDescriptor *IncomingArgY = std::get<0>(
3442 const ArgDescriptor *IncomingArgZ = std::get<0>(
3444
3445 SDValue InputReg;
3446 SDLoc SL;
3447
3448 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3449 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3450 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3451
3452 // If incoming ids are not packed we need to pack them.
3453 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3454 NeedWorkItemIDX) {
3455 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3456 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3457 } else {
3458 InputReg = DAG.getConstant(0, DL, MVT::i32);
3459 }
3460 }
3461
3462 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3463 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3464 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3465 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3466 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3467 InputReg = InputReg.getNode()
3468 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3469 : Y;
3470 }
3471
3472 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3473 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3474 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3475 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3476 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3477 InputReg = InputReg.getNode()
3478 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3479 : Z;
3480 }
3481
3482 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3483 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3484 // We're in a situation where the outgoing function requires the workitem
3485 // ID, but the calling function does not have it (e.g. a graphics function
3486 // calling a C calling convention function). This is illegal, but we need
3487 // to produce something.
3488 InputReg = DAG.getUNDEF(MVT::i32);
3489 } else {
3490 // Workitem ids are already packed; any of the present incoming arguments
3491 // will carry all required fields.
3492 ArgDescriptor IncomingArg =
3493 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3494 : IncomingArgY ? *IncomingArgY
3495 : *IncomingArgZ,
3496 ~0u);
3497 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3498 }
3499 }
3500
3501 if (OutgoingArg->isRegister()) {
3502 if (InputReg)
3503 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3504
3505 CCInfo.AllocateReg(OutgoingArg->getRegister());
3506 } else {
3507 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3508 if (InputReg) {
3509 SDValue ArgStore =
3510 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3511 MemOpChains.push_back(ArgStore);
3512 }
3513 }
3514}
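// Illustrative sketch, not part of this file: the shift/OR packing above.
// Unpacked incoming X/Y/Z workitem ids are combined into the single outgoing
// register with X in bits [9:0], Y in bits [19:10], and Z in bits [29:20]
// (the masking here is only for illustration; the DAG code relies on the ids
// already being in range).
static unsigned packWorkItemIDs(unsigned X, unsigned Y, unsigned Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}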
3515
3517 return CC == CallingConv::Fast;
3518}
3519
3520/// Return true if we might ever do TCO for calls with this calling convention.
3522 switch (CC) {
3523 case CallingConv::C:
3525 return true;
3526 default:
3527 return canGuaranteeTCO(CC);
3528 }
3529}
3530
3532 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3534 const SmallVectorImpl<SDValue> &OutVals,
3535 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3536 if (AMDGPU::isChainCC(CalleeCC))
3537 return true;
3538
3539 if (!mayTailCallThisCC(CalleeCC))
3540 return false;
3541
3542 // For a divergent call target, we need to do a waterfall loop over the
3543 // possible callees, which precludes us from using a simple jump.
3544 if (Callee->isDivergent())
3545 return false;
3546
3548 const Function &CallerF = MF.getFunction();
3549 CallingConv::ID CallerCC = CallerF.getCallingConv();
3551 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3552
3553 // Kernels aren't callable, and don't have a live-in return address, so it
3554 // doesn't make sense to do a tail call with entry functions.
3555 if (!CallerPreserved)
3556 return false;
3557
3558 bool CCMatch = CallerCC == CalleeCC;
3559
3561 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3562 return true;
3563 return false;
3564 }
3565
3566 // TODO: Can we handle var args?
3567 if (IsVarArg)
3568 return false;
3569
3570 for (const Argument &Arg : CallerF.args()) {
3571 if (Arg.hasByValAttr())
3572 return false;
3573 }
3574
3575 LLVMContext &Ctx = *DAG.getContext();
3576
3577 // Check that the call results are passed in the same way.
3578 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3579 CCAssignFnForCall(CalleeCC, IsVarArg),
3580 CCAssignFnForCall(CallerCC, IsVarArg)))
3581 return false;
3582
3583 // The callee has to preserve all registers the caller needs to preserve.
3584 if (!CCMatch) {
3585 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3586 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3587 return false;
3588 }
3589
3590 // Nothing more to check if the callee is taking no arguments.
3591 if (Outs.empty())
3592 return true;
3593
3595 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3596
3597 // FIXME: We are not allocating special input registers, so we will be
3598 // deciding based on incorrect register assignments.
3599 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3600
3601 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3602 // If the stack arguments for this call do not fit into our own save area then
3603 // the call cannot be made tail.
3604 // TODO: Is this really necessary?
3605 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3606 return false;
3607
3608 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3609 // FIXME: What about inreg arguments that end up passed in memory?
3610 if (!CCVA.isRegLoc())
3611 continue;
3612
3613 // If we are passing an argument in an SGPR, and the value is divergent,
3614 // this call requires a waterfall loop.
3615 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3616 LLVM_DEBUG(
3617 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3618 << printReg(CCVA.getLocReg(), TRI) << '\n');
3619 return false;
3620 }
3621 }
3622
3623 const MachineRegisterInfo &MRI = MF.getRegInfo();
3624 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3625}
3626
3628 if (!CI->isTailCall())
3629 return false;
3630
3631 const Function *ParentFn = CI->getParent()->getParent();
3633 return false;
3634 return true;
3635}
3636
3637// The wave scratch offset register is used as the global base pointer.
3639 SmallVectorImpl<SDValue> &InVals) const {
3640 CallingConv::ID CallConv = CLI.CallConv;
3641 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3642
3643 SelectionDAG &DAG = CLI.DAG;
3644
3645 TargetLowering::ArgListEntry RequestedExec;
3646 if (IsChainCallConv) {
3647 // The last argument should be the value that we need to put in EXEC.
3648 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3649 // don't treat it like the rest of the arguments.
3650 RequestedExec = CLI.Args.back();
3651 assert(RequestedExec.Node && "No node for EXEC");
3652
3653 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3654 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3655
3656 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3657 CLI.Outs.pop_back();
3658 CLI.OutVals.pop_back();
3659
3660 if (RequestedExec.Ty->isIntegerTy(64)) {
3661 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3662 CLI.Outs.pop_back();
3663 CLI.OutVals.pop_back();
3664 }
3665
3666 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3667 "Haven't popped all the pieces of the EXEC mask");
3668 }
3669
3670 const SDLoc &DL = CLI.DL;
3671 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3672 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3673 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3674 SDValue Chain = CLI.Chain;
3675 SDValue Callee = CLI.Callee;
3676 bool &IsTailCall = CLI.IsTailCall;
3677 bool IsVarArg = CLI.IsVarArg;
3678 bool IsSibCall = false;
3679 MachineFunction &MF = DAG.getMachineFunction();
3680
3681 if (Callee.isUndef() || isNullConstant(Callee)) {
3682 if (!CLI.IsTailCall) {
3683 for (ISD::InputArg &Arg : CLI.Ins)
3684 InVals.push_back(DAG.getUNDEF(Arg.VT));
3685 }
3686
3687 return Chain;
3688 }
3689
3690 if (IsVarArg) {
3691 return lowerUnhandledCall(CLI, InVals,
3692 "unsupported call to variadic function ");
3693 }
3694
3695 if (!CLI.CB)
3696 report_fatal_error("unsupported libcall legalization");
3697
3698 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3699 return lowerUnhandledCall(CLI, InVals,
3700 "unsupported required tail call to function ");
3701 }
3702
3703 if (IsTailCall) {
3704 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3705 Outs, OutVals, Ins, DAG);
3706 if (!IsTailCall &&
3707 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3708 report_fatal_error("failed to perform tail call elimination on a call "
3709 "site marked musttail or on llvm.amdgcn.cs.chain");
3710 }
3711
3712 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3713
3714 // A sibling call is one where we're under the usual C ABI and not planning
3715 // to change that but can still do a tail call:
3716 if (!TailCallOpt && IsTailCall)
3717 IsSibCall = true;
3718
3719 if (IsTailCall)
3720 ++NumTailCalls;
3721 }
3722
3723 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3724 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
3725 SmallVector<SDValue, 8> MemOpChains;
3726
3727 // Analyze operands of the call, assigning locations to each operand.
3728 SmallVector<CCValAssign, 16> ArgLocs;
3729 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3730 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3731
3732 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3733 // With a fixed ABI, allocate fixed registers before user arguments.
3734 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3735 }
3736
3737 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3738
3739 // Get a count of how many bytes are to be pushed on the stack.
3740 unsigned NumBytes = CCInfo.getStackSize();
3741
3742 if (IsSibCall) {
3743 // Since we're not changing the ABI to make this a tail call, the memory
3744 // operands are already available in the caller's incoming argument space.
3745 NumBytes = 0;
3746 }
3747
3748 // FPDiff is the byte offset of the call's argument area from the callee's.
3749 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3750 // by this amount for a tail call. In a sibling call it must be 0 because the
3751 // caller will deallocate the entire stack and the callee still expects its
3752 // arguments to begin at SP+0. Completely unused for non-tail calls.
3753 int32_t FPDiff = 0;
3754 MachineFrameInfo &MFI = MF.getFrameInfo();
3755 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3756
3757 // Adjust the stack pointer for the new arguments...
3758 // These operations are automatically eliminated by the prolog/epilog pass
3759 if (!IsSibCall)
3760 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3761
3762 if (!IsSibCall || IsChainCallConv) {
3763 if (!Subtarget->enableFlatScratch()) {
3764 SmallVector<SDValue, 4> CopyFromChains;
3765
3766 // In the HSA case, this should be an identity copy.
3767 SDValue ScratchRSrcReg =
3768 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3769 RegsToPass.emplace_back(IsChainCallConv
3770 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3771 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3772 ScratchRSrcReg);
3773 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3774 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3775 }
3776 }
3777
3778 const unsigned NumSpecialInputs = RegsToPass.size();
3779
3780 MVT PtrVT = MVT::i32;
3781
3782 // Walk the register/memloc assignments, inserting copies/loads.
3783 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3784 CCValAssign &VA = ArgLocs[i];
3785 SDValue Arg = OutVals[i];
3786
3787 // Promote the value if needed.
3788 switch (VA.getLocInfo()) {
3789 case CCValAssign::Full:
3790 break;
3791 case CCValAssign::BCvt:
3792 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3793 break;
3794 case CCValAssign::ZExt:
3795 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3796 break;
3797 case CCValAssign::SExt:
3798 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3799 break;
3800 case CCValAssign::AExt:
3801 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3802 break;
3803 case CCValAssign::FPExt:
3804 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3805 break;
3806 default:
3807 llvm_unreachable("Unknown loc info!");
3808 }
3809
3810 if (VA.isRegLoc()) {
3811 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3812 } else {
3813 assert(VA.isMemLoc());
3814
3815 SDValue DstAddr;
3816 MachinePointerInfo DstInfo;
3817
3818 unsigned LocMemOffset = VA.getLocMemOffset();
3819 int32_t Offset = LocMemOffset;
3820
3821 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3822 MaybeAlign Alignment;
3823
3824 if (IsTailCall) {
3825 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3826 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3827 : VA.getValVT().getStoreSize();
3828
3829 // FIXME: We can have better than the minimum byval required alignment.
3830 Alignment =
3831 Flags.isByVal()
3832 ? Flags.getNonZeroByValAlign()
3833 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3834
3835 Offset = Offset + FPDiff;
3836 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3837
3838 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3839 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3840
3841 // Make sure any stack arguments overlapping with where we're storing
3842 // are loaded before this eventual operation. Otherwise they'll be
3843 // clobbered.
3844
3845 // FIXME: Why is this really necessary? This seems to just result in a
3846 // lot of code to copy stack values and write them back to the same
3847 // locations, which are supposed to be immutable?
3848 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3849 } else {
3850 // Stores to the argument stack area are relative to the stack pointer.
3851 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3852 MVT::i32);
3853 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3854 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3855 Alignment =
3856 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3857 }
3858
3859 if (Outs[i].Flags.isByVal()) {
3860 SDValue SizeNode =
3861 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3862 SDValue Cpy =
3863 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3864 Outs[i].Flags.getNonZeroByValAlign(),
3865 /*isVol = */ false, /*AlwaysInline = */ true,
3866 /*CI=*/nullptr, std::nullopt, DstInfo,
3867 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3868
3869 MemOpChains.push_back(Cpy);
3870 } else {
3871 SDValue Store =
3872 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3873 MemOpChains.push_back(Store);
3874 }
3875 }
3876 }
3877
3878 if (!MemOpChains.empty())
3879 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3880
3881 SDValue ReadFirstLaneID =
3882 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3883
3884 SDValue TokenGlue;
3885 if (CLI.ConvergenceControlToken) {
3886 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3887 CLI.ConvergenceControlToken);
3888 }
3889
3890 // Build a sequence of copy-to-reg nodes chained together with token chain
3891 // and flag operands which copy the outgoing args into the appropriate regs.
3892 SDValue InGlue;
3893
3894 unsigned ArgIdx = 0;
3895 for (auto [Reg, Val] : RegsToPass) {
3896 if (ArgIdx++ >= NumSpecialInputs &&
3897 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3898 // For chain calls, the inreg arguments are required to be
3899 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3900 // they are uniform.
3901 //
3902 // For other calls, if an inreg argument is known to be uniform,
3903 // speculatively insert a readfirstlane in case it is in a VGPR.
3904 //
3905 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3906 // value, so let that continue to produce invalid code.
3907
3908 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3909 if (TokenGlue)
3910 ReadfirstlaneArgs.push_back(TokenGlue);
3911 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3912 ReadfirstlaneArgs);
3913 }
3914
3915 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3916 InGlue = Chain.getValue(1);
3917 }
3918
3919 // We don't usually want to end the call-sequence here because we would tidy
3920 // the frame up *after* the call, however in the ABI-changing tail-call case
3921 // we've carefully laid out the parameters so that when sp is reset they'll be
3922 // in the correct location.
3923 if (IsTailCall && !IsSibCall) {
3924 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3925 InGlue = Chain.getValue(1);
3926 }
3927
3928 std::vector<SDValue> Ops({Chain});
3929
3930 // Add a redundant copy of the callee global which will not be legalized, as
3931 // we need direct access to the callee later.
3932 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3933 const GlobalValue *GV = GSD->getGlobal();
3934 Ops.push_back(Callee);
3935 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3936 } else {
3937 if (IsTailCall) {
3938 // isEligibleForTailCallOptimization considered whether the call target is
3939 // divergent, but we may still end up with a uniform value in a VGPR.
3940 // Insert a readfirstlane just in case.
3941 SDValue ReadFirstLaneID =
3942 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3943
3944 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3945 if (TokenGlue)
3946 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3947 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3948 ReadfirstlaneArgs);
3949 }
3950
3951 Ops.push_back(Callee);
3952 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3953 }
3954
3955 if (IsTailCall) {
3956 // Each tail call may have to adjust the stack by a different amount, so
3957 // this information must travel along with the operation for eventual
3958 // consumption by emitEpilogue.
3959 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3960 }
3961
3962 if (IsChainCallConv)
3963 Ops.push_back(RequestedExec.Node);
3964
3965 // Add argument registers to the end of the list so that they are known live
3966 // into the call.
3967 for (auto &[Reg, Val] : RegsToPass)
3968 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3969
3970 // Add a register mask operand representing the call-preserved registers.
3971 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3972 assert(Mask && "Missing call preserved mask for calling convention");
3973 Ops.push_back(DAG.getRegisterMask(Mask));
3974
3975 if (SDValue Token = CLI.ConvergenceControlToken) {
3976 SmallVector<SDValue, 2> GlueOps;
3977 GlueOps.push_back(Token);
3978 if (InGlue)
3979 GlueOps.push_back(InGlue);
3980
3981 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3982 MVT::Glue, GlueOps),
3983 0);
3984 }
3985
3986 if (InGlue)
3987 Ops.push_back(InGlue);
3988
3989 // If we're doing a tail call, use a TC_RETURN here rather than an
3990 // actual call instruction.
3991 if (IsTailCall) {
3992 MFI.setHasTailCall();
3993 unsigned OPC = AMDGPUISD::TC_RETURN;
3994 switch (CallConv) {
3995 case CallingConv::AMDGPU_Gfx:
3996 OPC = AMDGPUISD::TC_RETURN_GFX;
3997 break;
3998 case CallingConv::AMDGPU_CS_Chain:
3999 case CallingConv::AMDGPU_CS_ChainPreserve:
4000 OPC = AMDGPUISD::TC_RETURN_CHAIN;
4001 break;
4002 }
4003
4004 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4005 }
4006
4007 // Returns a chain and a flag for retval copy to use.
4008 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4009 Chain = Call.getValue(0);
4010 InGlue = Call.getValue(1);
4011
4012 uint64_t CalleePopBytes = NumBytes;
4013 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4014 if (!Ins.empty())
4015 InGlue = Chain.getValue(1);
4016
4017 // Handle result values, copying them out of physregs into vregs that we
4018 // return.
4019 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4020 InVals, /*IsThisReturn=*/false, SDValue());
4021}
4022
4023// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4024// except for:
4025// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4026// 2. The allocation size, which is scaled: size = wave-reduction(alloca-size) * wave-size
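// For example, on a wave64 target an alloca of 8 bytes per lane advances the
// scalar stack pointer by 8 << 6 = 512 bytes, i.e. one lane's allocation for
// every lane in the wave.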
4027SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
4028 SelectionDAG &DAG) const {
4029 const MachineFunction &MF = DAG.getMachineFunction();
4030 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4031
4032 SDLoc dl(Op);
4033 EVT VT = Op.getValueType();
4034 SDValue Chain = Op.getOperand(0);
4035 Register SPReg = Info->getStackPtrOffsetReg();
4036
4037 // Chain the dynamic stack allocation so that it doesn't modify the stack
4038 // pointer when other instructions are using the stack.
4039 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4040
4041 SDValue Size = Op.getOperand(1);
4042 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4043 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4044
4045 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4047 "Stack grows upwards for AMDGPU");
4048
4049 Chain = BaseAddr.getValue(1);
4050 Align StackAlign = TFL->getStackAlign();
4051 if (Alignment > StackAlign) {
4052 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4053 << Subtarget->getWavefrontSizeLog2();
4054 uint64_t StackAlignMask = ScaledAlignment - 1;
4055 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4056 DAG.getConstant(StackAlignMask, dl, VT));
4057 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4058 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4059 }
4060
4061 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4062 SDValue NewSP;
4063 if (isa<ConstantSDNode>(Size)) {
4064 // For a constant-sized alloca, scale the alloca size by the wave size.
4065 SDValue ScaledSize = DAG.getNode(
4066 ISD::SHL, dl, VT, Size,
4067 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4068 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4069 } else {
4070 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4071 // maximum of the (divergent) alloca size, then scale it by the wave size.
4072 SDValue WaveReduction =
4073 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4074 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4075 Size, DAG.getConstant(0, dl, MVT::i32));
4076 SDValue ScaledSize = DAG.getNode(
4077 ISD::SHL, dl, VT, Size,
4078 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4079 NewSP =
4080 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4081 SDValue ReadFirstLaneID =
4082 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4083 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4084 NewSP);
4085 }
4086
4087 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4088 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4089
4090 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4091}
4092
4093SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4094 if (Op.getValueType() != MVT::i32)
4095 return Op; // Defer to cannot select error.
4096
4097 Register SP = getStackPointerRegisterToSaveRestore();
4098 SDLoc SL(Op);
4099
4100 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4101
4102 // Convert from wave uniform to swizzled vector address. This should protect
4103 // from any edge cases where the stacksave result isn't directly used with
4104 // stackrestore.
4105 SDValue VectorAddress =
4106 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4107 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4108}
4109
4110SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4111 SelectionDAG &DAG) const {
4112 SDLoc SL(Op);
4113 assert(Op.getValueType() == MVT::i32);
4114
4115 uint32_t BothRoundHwReg =
4116 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4117 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4118
4119 SDValue IntrinID =
4120 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4121 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4122 Op.getOperand(0), IntrinID, GetRoundBothImm);
4123
4124 // There are two rounding modes, one for f32 and one for f64/f16. We only
4125 // report in the standard value range if both are the same.
4126 //
4127 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4128 // ties away from zero is not supported, and the other values are rotated by
4129 // 1.
4130 //
4131 // If the two rounding modes are not the same, report a target defined value.
4132
4133 // Mode register rounding mode fields:
4134 //
4135 // [1:0] Single-precision round mode.
4136 // [3:2] Double/Half-precision round mode.
4137 //
4138 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4139 //
4140 // Hardware Spec
4141 // Toward-0 3 0
4142 // Nearest Even 0 1
4143 // +Inf 1 2
4144 // -Inf 2 3
4145 // NearestAway0 N/A 4
4146 //
4147 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4148 // table we can index by the raw hardware mode.
4149 //
4150 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
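 // For example, if MODE.fp_round reads back as 0 (both fields nearest-even),
 // the shift selects table entry 0, which per the Hardware/Spec columns above
 // is 1 (to-nearest) and, being below 4, is returned directly. Entries of 4
 // or more are extended values and get the +4 offset applied below.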
4151
4152 SDValue BitTable =
4153 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4154
4155 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4156 SDValue RoundModeTimesNumBits =
4157 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4158
4159 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4160 // knew only one mode was demanded.
4161 SDValue TableValue =
4162 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4163 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4164
4165 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4166 SDValue TableEntry =
4167 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4168
4169 // There's a gap between the 4-bit encoded table values and the actual enum
4170 // values, so offset by 4 if it's an extended value.
4171 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4172 SDValue IsStandardValue =
4173 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4174 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4175 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4176 TableEntry, EnumOffset);
4177
4178 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4179}
4180
4181SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4182 SelectionDAG &DAG) const {
4183 SDLoc SL(Op);
4184
4185 SDValue NewMode = Op.getOperand(1);
4186 assert(NewMode.getValueType() == MVT::i32);
4187
4188 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4189 // hardware MODE.fp_round values.
4190 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4191 uint32_t ClampedVal = std::min(
4192 static_cast<uint32_t>(ConstMode->getZExtValue()),
4193 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4194 NewMode = DAG.getConstant(
4195 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4196 } else {
4197 // If we know the input can only be one of the supported standard modes in
4198 // the range 0-3, we can use a simplified mapping to hardware values.
4199 KnownBits KB = DAG.computeKnownBits(NewMode);
4200 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4201 // The supported standard values are 0-3. The extended values start at 8. We
4202 // need to offset by 4 if the value is in the extended range.
4203
4204 if (UseReducedTable) {
4205 // Keep only the low 16 bits of the table (the four standard modes) in a 32-bit constant.
4206 SDValue BitTable = DAG.getConstant(
4207 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4208
4209 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4210 SDValue RoundModeTimesNumBits =
4211 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4212
4213 NewMode =
4214 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4215
4216 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4217 // the table extracted bits into inline immediates.
4218 } else {
4219 // table_index = umin(value, value - 4)
4220 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
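 // For example, a standard value like 2 keeps index 2 (2u - 4u wraps to a huge
 // value, so umin returns 2), while the first extended value 8 maps to index
 // 8 - 4 = 4; standard and extended entries thus occupy disjoint table slots.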
4221 SDValue BitTable =
4222 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4223
4224 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4225 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4226 SDValue IndexVal =
4227 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4228
4229 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4230 SDValue RoundModeTimesNumBits =
4231 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4232
4233 SDValue TableValue =
4234 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4235 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4236
4237 // No need to mask out the high bits since the setreg will ignore them
4238 // anyway.
4239 NewMode = TruncTable;
4240 }
4241
4242 // Insert a readfirstlane in case the value is a VGPR. We could do this
4243 // earlier and keep more operations scalar, but that interferes with
4244 // combining the source.
4245 SDValue ReadFirstLaneID =
4246 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4247 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4248 ReadFirstLaneID, NewMode);
4249 }
4250
4251 // N.B. The setreg will be later folded into s_round_mode on supported
4252 // targets.
4253 SDValue IntrinID =
4254 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4255 uint32_t BothRoundHwReg =
4256 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4257 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4258
4259 SDValue SetReg =
4260 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4261 IntrinID, RoundBothImm, NewMode);
4262
4263 return SetReg;
4264}
4265
4266SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4267 if (Op->isDivergent())
4268 return SDValue();
4269
4270 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4271 case AMDGPUAS::FLAT_ADDRESS:
4272 case AMDGPUAS::GLOBAL_ADDRESS:
4273 case AMDGPUAS::CONSTANT_ADDRESS:
4274 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4275 break;
4276 default:
4277 return SDValue();
4278 }
4279
4280 return Op;
4281}
4282
4283// Work around DAG legality rules only based on the result type.
4284SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4285 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4286 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4287 EVT SrcVT = Src.getValueType();
4288
4289 if (SrcVT.getScalarType() != MVT::bf16)
4290 return Op;
4291
4292 SDLoc SL(Op);
4293 SDValue BitCast =
4294 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4295
4296 EVT DstVT = Op.getValueType();
4297 if (IsStrict)
4298 llvm_unreachable("Need STRICT_BF16_TO_FP");
4299
4300 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4301}
4302
4303SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4304 SDLoc SL(Op);
4305 if (Op.getValueType() != MVT::i64)
4306 return Op;
4307
4308 uint32_t ModeHwReg =
4309 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 32);
4310 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4311 uint32_t TrapHwReg =
4312 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 32);
4313 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4314
4315 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4316 SDValue IntrinID =
4317 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4318 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4319 Op.getOperand(0), IntrinID, ModeHwRegImm);
4320 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4321 Op.getOperand(0), IntrinID, TrapHwRegImm);
4322 SDValue TokenReg =
4323 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4324 GetTrapReg.getValue(1));
4325
4326 SDValue CvtPtr =
4327 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4328 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4329
4330 return DAG.getMergeValues({Result, TokenReg}, SL);
4331}
4332
4333SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4334 SDLoc SL(Op);
4335 if (Op.getOperand(1).getValueType() != MVT::i64)
4336 return Op;
4337
4338 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4339 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4340 DAG.getConstant(0, SL, MVT::i32));
4341 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4342 DAG.getConstant(1, SL, MVT::i32));
4343
4344 SDValue ReadFirstLaneID =
4345 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4346 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4347 ReadFirstLaneID, NewModeReg);
4348 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4349 ReadFirstLaneID, NewTrapReg);
4350
4351 unsigned ModeHwReg =
4352 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 32);
4353 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4354 unsigned TrapHwReg =
4355 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 32);
4356 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4357
4358 SDValue IntrinID =
4359 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4360 SDValue SetModeReg =
4361 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4362 IntrinID, ModeHwRegImm, NewModeReg);
4363 SDValue SetTrapReg =
4364 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4365 IntrinID, TrapHwRegImm, NewTrapReg);
4366 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4367}
4368
4369Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4370 const MachineFunction &MF) const {
4371 Register Reg = StringSwitch<Register>(RegName)
4372 .Case("m0", AMDGPU::M0)
4373 .Case("exec", AMDGPU::EXEC)
4374 .Case("exec_lo", AMDGPU::EXEC_LO)
4375 .Case("exec_hi", AMDGPU::EXEC_HI)
4376 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4377 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4378 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4379 .Default(Register());
4380
4381 if (Reg == AMDGPU::NoRegister) {
4383 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4384 }
4385
4386 if (!Subtarget->hasFlatScrRegister() &&
4387 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4388 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4389 "\" for subtarget."));
4390 }
4391
4392 switch (Reg) {
4393 case AMDGPU::M0:
4394 case AMDGPU::EXEC_LO:
4395 case AMDGPU::EXEC_HI:
4396 case AMDGPU::FLAT_SCR_LO:
4397 case AMDGPU::FLAT_SCR_HI:
4398 if (VT.getSizeInBits() == 32)
4399 return Reg;
4400 break;
4401 case AMDGPU::EXEC:
4402 case AMDGPU::FLAT_SCR:
4403 if (VT.getSizeInBits() == 64)
4404 return Reg;
4405 break;
4406 default:
4407 llvm_unreachable("missing register type checking");
4408 }
4409
4411 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4412}
4413
4414// If kill is not the last instruction, split the block so kill is always a
4415// proper terminator.
4416MachineBasicBlock *
4417SITargetLowering::splitKillBlock(MachineInstr &MI,
4418 MachineBasicBlock *BB) const {
4419 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4420 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4421 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4422 return SplitBB;
4423}
4424
4425// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4426// \p MI will be the only instruction in the loop body block. Otherwise, it will
4427// be the first instruction in the remainder block.
4428//
4429/// \returns { LoopBody, Remainder }
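// The resulting control flow is:
//
//   MBB -> LoopBB -> RemainderBB
//            ^  |
//            +--+   (LoopBB also branches back to itself until it is done)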
4430static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4431splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4432 MachineFunction *MF = MBB.getParent();
4433 MachineBasicBlock::iterator I(&MI);
4434
4435 // To insert the loop we need to split the block. Move everything after this
4436 // point to a new block, and insert a new empty block between the two.
4437 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4438 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4439 MachineFunction::iterator MBBI(MBB);
4440 ++MBBI;
4441
4442 MF->insert(MBBI, LoopBB);
4443 MF->insert(MBBI, RemainderBB);
4444
4445 LoopBB->addSuccessor(LoopBB);
4446 LoopBB->addSuccessor(RemainderBB);
4447
4448 // Move the rest of the block into a new block.
4449 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4450
4451 if (InstInLoop) {
4452 auto Next = std::next(I);
4453
4454 // Move instruction to loop body.
4455 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4456
4457 // Move the rest of the block.
4458 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4459 } else {
4460 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4461 }
4462
4463 MBB.addSuccessor(LoopBB);
4464
4465 return std::pair(LoopBB, RemainderBB);
4466}
4467
4468/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4469void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4470 MachineBasicBlock *MBB = MI.getParent();
4471 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4472 auto I = MI.getIterator();
4473 auto E = std::next(I);
4474
4475 // clang-format off
4476 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4477 .addImm(0);
4478 // clang-format on
4479
4480 MIBundleBuilder Bundler(*MBB, I, E);
4481 finalizeBundle(*MBB, Bundler.begin());
4482}
4483
4484MachineBasicBlock *
4485SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4486 MachineBasicBlock *BB) const {
4487 const DebugLoc &DL = MI.getDebugLoc();
4488
4490
4492
4493 // Apparently kill flags are only valid if the def is in the same block?
4494 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4495 Src->setIsKill(false);
4496
4497 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4498
4499 MachineBasicBlock::iterator I = LoopBB->end();
4500
4501 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4502 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4503
4504 // Clear TRAP_STS.MEM_VIOL
4505 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4506 .addImm(0)
4507 .addImm(EncodedReg);
4508
4509 bundleInstWithWaitcnt(MI);
4510
4511 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4512
4513 // Load and check TRAP_STS.MEM_VIOL
4514 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4515 .addImm(EncodedReg);
4516
4517 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4518 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4519 .addReg(Reg, RegState::Kill)
4520 .addImm(0);
4521 // clang-format off
4522 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4523 .addMBB(LoopBB);
4524 // clang-format on
4525
4526 return RemainderBB;
4527}
4528
4529// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4530// wavefront. If the value is uniform and just happens to be in a VGPR, this
4531// will only do one iteration. In the worst case, this will loop 64 times.
4532//
4533// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
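// Roughly, the loop emitted below is:
//   loop:
//     idx   = v_readfirstlane_b32 IdxReg     ; one lane's index value
//     cond  = v_cmp_eq_u32 idx, IdxReg       ; all lanes sharing that value
//     saved = EXEC; EXEC &= cond             ; s_and_saveexec
//     ... idx (plus Offset) is moved into M0 or an SGPR ...
//     EXEC = saved ^ EXEC                    ; retire the lanes just handled
//     s_cbranch_execnz loop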
4534static MachineBasicBlock::iterator
4535emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4536 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4537 const DebugLoc &DL, const MachineOperand &Idx,
4538 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4539 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4540 Register &SGPRIdxReg) {
4541
4542 MachineFunction *MF = OrigBB.getParent();
4543 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4544 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4546
4547 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4548 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4549 Register NewExec = MRI.createVirtualRegister(BoolRC);
4550 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4551 Register CondReg = MRI.createVirtualRegister(BoolRC);
4552
4553 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4554 .addReg(InitReg)
4555 .addMBB(&OrigBB)
4556 .addReg(ResultReg)
4557 .addMBB(&LoopBB);
4558
4559 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4560 .addReg(InitSaveExecReg)
4561 .addMBB(&OrigBB)
4562 .addReg(NewExec)
4563 .addMBB(&LoopBB);
4564
4565 // Read the next variant <- also loop target.
4566 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4567 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4568
4569 // Compare the just read M0 value to all possible Idx values.
4570 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4571 .addReg(CurrentIdxReg)
4572 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4573
4574 // Update EXEC, save the original EXEC value to VCC.
4575 BuildMI(LoopBB, I, DL,
4576 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4577 : AMDGPU::S_AND_SAVEEXEC_B64),
4578 NewExec)
4579 .addReg(CondReg, RegState::Kill);
4580
4581 MRI.setSimpleHint(NewExec, CondReg);
4582
4583 if (UseGPRIdxMode) {
4584 if (Offset == 0) {
4585 SGPRIdxReg = CurrentIdxReg;
4586 } else {
4587 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4588 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4589 .addReg(CurrentIdxReg, RegState::Kill)
4590 .addImm(Offset);
4591 }
4592 } else {
4593 // Move index from VCC into M0
4594 if (Offset == 0) {
4595 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4596 .addReg(CurrentIdxReg, RegState::Kill);
4597 } else {
4598 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4599 .addReg(CurrentIdxReg, RegState::Kill)
4600 .addImm(Offset);
4601 }
4602 }
4603
4604 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4605 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4606 MachineInstr *InsertPt =
4607 BuildMI(LoopBB, I, DL,
4608 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4609 : AMDGPU::S_XOR_B64_term),
4610 Exec)
4611 .addReg(Exec)
4612 .addReg(NewExec);
4613
4614 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4615 // s_cbranch_scc0?
4616
4617 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4618 // clang-format off
4619 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4620 .addMBB(&LoopBB);
4621 // clang-format on
4622
4623 return InsertPt->getIterator();
4624}
4625
4626// This has slightly sub-optimal regalloc when the source vector is killed by
4627// the read. The register allocator does not understand that the kill is
4628// per-workitem, so the source is kept alive for the whole loop and we end up
4629// not re-using a subregister from it, using 1 more VGPR than necessary. This
4630// was avoided when this was expanded after register allocation.
4631static MachineBasicBlock::iterator
4632loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4633 unsigned InitResultReg, unsigned PhiReg, int Offset,
4634 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4635 MachineFunction *MF = MBB.getParent();
4636 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4637 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4638 MachineRegisterInfo &MRI = MF->getRegInfo();
4639 const DebugLoc &DL = MI.getDebugLoc();
4640 MachineBasicBlock::iterator I(&MI);
4641
4642 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4643 Register DstReg = MI.getOperand(0).getReg();
4644 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4645 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4646 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4647 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4648
4649 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4650
4651 // Save the EXEC mask
4652 // clang-format off
4653 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4654 .addReg(Exec);
4655 // clang-format on
4656
4657 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4658
4659 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4660
4661 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4662 InitResultReg, DstReg, PhiReg, TmpExec,
4663 Offset, UseGPRIdxMode, SGPRIdxReg);
4664
4665 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4667 ++MBBI;
4668 MF->insert(MBBI, LandingPad);
4669 LoopBB->removeSuccessor(RemainderBB);
4670 LandingPad->addSuccessor(RemainderBB);
4671 LoopBB->addSuccessor(LandingPad);
4672 MachineBasicBlock::iterator First = LandingPad->begin();
4673 // clang-format off
4674 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4675 .addReg(SaveExec);
4676 // clang-format on
4677
4678 return InsPt;
4679}
4680
4681// Returns subreg index, offset
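// For example, indexing a 128-bit (4 x 32-bit) vector with a constant offset
// of 2 yields {sub2, 0}, so the access becomes a plain subregister access; an
// out-of-range offset such as 7 yields {sub0, 7} and is left to the dynamic
// index computation.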
4682static std::pair<unsigned, int>
4684 const TargetRegisterClass *SuperRC, unsigned VecReg,
4685 int Offset) {
4686 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4687
4688 // Skip out of bounds offsets, or else we would end up using an undefined
4689 // register.
4690 if (Offset >= NumElts || Offset < 0)
4691 return std::pair(AMDGPU::sub0, Offset);
4692
4693 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4694}
4695
4698 int Offset) {
4699 MachineBasicBlock *MBB = MI.getParent();
4700 const DebugLoc &DL = MI.getDebugLoc();
4702
4703 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4704
4705 assert(Idx->getReg() != AMDGPU::NoRegister);
4706
4707 if (Offset == 0) {
4708 // clang-format off
4709 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4710 .add(*Idx);
4711 // clang-format on
4712 } else {
4713 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4714 .add(*Idx)
4715 .addImm(Offset);
4716 }
4717}
4718
4721 int Offset) {
4722 MachineBasicBlock *MBB = MI.getParent();
4723 const DebugLoc &DL = MI.getDebugLoc();
4725
4726 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4727
4728 if (Offset == 0)
4729 return Idx->getReg();
4730
4731 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4732 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4733 .add(*Idx)
4734 .addImm(Offset);
4735 return Tmp;
4736}
4737
4740 const GCNSubtarget &ST) {
4741 const SIInstrInfo *TII = ST.getInstrInfo();
4742 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4745
4746 Register Dst = MI.getOperand(0).getReg();
4747 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4748 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4749 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4750
4751 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4752 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4753
4754 unsigned SubReg;
4755 std::tie(SubReg, Offset) =
4756 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4757
4758 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4759
4760 // Check for a SGPR index.
4761 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4763 const DebugLoc &DL = MI.getDebugLoc();
4764
4765 if (UseGPRIdxMode) {
4766 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4767 // to avoid interfering with other uses, so probably requires a new
4768 // optimization pass.
4769 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4770
4771 const MCInstrDesc &GPRIDXDesc =
4772 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4773 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4774 .addReg(SrcReg)
4775 .addReg(Idx)
4776 .addImm(SubReg);
4777 } else {
4778 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4779
4780 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4781 .addReg(SrcReg, 0, SubReg)
4782 .addReg(SrcReg, RegState::Implicit);
4783 }
4784
4785 MI.eraseFromParent();
4786
4787 return &MBB;
4788 }
4789
4790 // Control flow needs to be inserted if indexing with a VGPR.
4791 const DebugLoc &DL = MI.getDebugLoc();
4793
4794 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4795 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4796
4797 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4798
4799 Register SGPRIdxReg;
4800 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4801 UseGPRIdxMode, SGPRIdxReg);
4802
4803 MachineBasicBlock *LoopBB = InsPt->getParent();
4804
4805 if (UseGPRIdxMode) {
4806 const MCInstrDesc &GPRIDXDesc =
4807 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4808
4809 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4810 .addReg(SrcReg)
4811 .addReg(SGPRIdxReg)
4812 .addImm(SubReg);
4813 } else {
4814 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4815 .addReg(SrcReg, 0, SubReg)
4816 .addReg(SrcReg, RegState::Implicit);
4817 }
4818
4819 MI.eraseFromParent();
4820
4821 return LoopBB;
4822}
4823
4826 const GCNSubtarget &ST) {
4827 const SIInstrInfo *TII = ST.getInstrInfo();
4828 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4831
4832 Register Dst = MI.getOperand(0).getReg();
4833 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4834 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4835 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4836 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4837 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4838 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4839
4840 // This can be an immediate, but will be folded later.
4841 assert(Val->getReg());
4842
4843 unsigned SubReg;
4844 std::tie(SubReg, Offset) =
4845 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4846 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4847
4848 if (Idx->getReg() == AMDGPU::NoRegister) {
4850 const DebugLoc &DL = MI.getDebugLoc();
4851
4852 assert(Offset == 0);
4853
4854 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4855 .add(*SrcVec)
4856 .add(*Val)
4857 .addImm(SubReg);
4858
4859 MI.eraseFromParent();
4860 return &MBB;
4861 }
4862
4863 // Check for a SGPR index.
4864 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4866 const DebugLoc &DL = MI.getDebugLoc();
4867
4868 if (UseGPRIdxMode) {
4869 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4870
4871 const MCInstrDesc &GPRIDXDesc =
4872 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4873 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4874 .addReg(SrcVec->getReg())
4875 .add(*Val)
4876 .addReg(Idx)
4877 .addImm(SubReg);
4878 } else {
4879 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4880
4881 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4882 TRI.getRegSizeInBits(*VecRC), 32, false);
4883 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4884 .addReg(SrcVec->getReg())
4885 .add(*Val)
4886 .addImm(SubReg);
4887 }
4888 MI.eraseFromParent();
4889 return &MBB;
4890 }
4891
4892 // Control flow needs to be inserted if indexing with a VGPR.
4893 if (Val->isReg())
4894 MRI.clearKillFlags(Val->getReg());
4895
4896 const DebugLoc &DL = MI.getDebugLoc();
4897
4898 Register PhiReg = MRI.createVirtualRegister(VecRC);
4899
4900 Register SGPRIdxReg;
4901 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4902 UseGPRIdxMode, SGPRIdxReg);
4903 MachineBasicBlock *LoopBB = InsPt->getParent();
4904
4905 if (UseGPRIdxMode) {
4906 const MCInstrDesc &GPRIDXDesc =
4907 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4908
4909 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4910 .addReg(PhiReg)
4911 .add(*Val)
4912 .addReg(SGPRIdxReg)
4913 .addImm(SubReg);
4914 } else {
4915 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4916 TRI.getRegSizeInBits(*VecRC), 32, false);
4917 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4918 .addReg(PhiReg)
4919 .add(*Val)
4920 .addImm(SubReg);
4921 }
4922
4923 MI.eraseFromParent();
4924 return LoopBB;
4925}
4926
4927static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4928 MachineBasicBlock &BB,
4929 const GCNSubtarget &ST,
4930 unsigned Opc) {
4931 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4932 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4933 const DebugLoc &DL = MI.getDebugLoc();
4934 const SIInstrInfo *TII = ST.getInstrInfo();
4935
4936 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4937 Register SrcReg = MI.getOperand(1).getReg();
4938 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4939 Register DstReg = MI.getOperand(0).getReg();
4940 MachineBasicBlock *RetBB = nullptr;
4941 if (isSGPR) {
4942 // These operations with a uniform value i.e. SGPR are idempotent.
4943 // Reduced value will be same as given sgpr.
4944 // clang-format off
4945 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4946 .addReg(SrcReg);
4947 // clang-format on
4948 RetBB = &BB;
4949 } else {
4950 // TODO: Implement the DPP strategy and switch based on the immediate
4951 // strategy operand. For now, for all the cases (default, Iterative and
4952 // DPP) we use the iterative approach by default.
4953
4954 // To reduce the VGPR using the iterative approach, we need to iterate
4955 // over all the active lanes. Lowering consists of ComputeLoop,
4956 // which iterates over only the active lanes. We use a copy of the EXEC
4957 // register as the induction variable, and every active lane clears its bit
4958 // with bitset0 so that we get the next active lane for the next iteration.
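 // In pseudocode, for the unsigned min/max reductions handled here:
 //   acc    = identity                      ; UINT32_MAX for umin, 0 for umax
 //   active = EXEC
 //   do {
 //     lane   = s_ff1 active                ; lowest remaining active lane
 //     acc    = op(acc, v_readlane(src, lane))
 //     active = s_bitset0 active, lane
 //   } while (active != 0)
 //   dst = acc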
4960 Register SrcReg = MI.getOperand(1).getReg();
4961
4962 // Create control flow for the loop:
4963 // split MI's machine basic block into the compute loop and its exit block.
4964 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4965
4966 // Create virtual registers required for lowering.
4967 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4968 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4969 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4970 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4971
4972 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4973 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4974 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4975
4976 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4977 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4978
4979 bool IsWave32 = ST.isWave32();
4980 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4981 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4982
4983 // Create initial values of the induction variable from EXEC and the
4984 // accumulator, and insert a branch to the newly created ComputeLoop block.
4985 uint32_t InitalValue =
4986 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4987 auto TmpSReg =
4988 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4989 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4990 .addImm(InitalValue);
4991 // clang-format off
4992 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
4993 .addMBB(ComputeLoop);
4994 // clang-format on
4995
4996 // Start constructing ComputeLoop
4997 I = ComputeLoop->end();
4998 auto Accumulator =
4999 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5000 .addReg(InitalValReg)
5001 .addMBB(&BB);
5002 auto ActiveBits =
5003 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5004 .addReg(TmpSReg->getOperand(0).getReg())
5005 .addMBB(&BB);
5006
5007 // Perform the computations
5008 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5009 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5010 .addReg(ActiveBits->getOperand(0).getReg());
5011 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5012 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5013 .addReg(SrcReg)
5014 .addReg(FF1->getOperand(0).getReg());
5015 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5016 .addReg(Accumulator->getOperand(0).getReg())
5017 .addReg(LaneValue->getOperand(0).getReg());
5018
5019 // Manipulate the iterator to get the next active lane
5020 unsigned BITSETOpc =
5021 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5022 auto NewActiveBits =
5023 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5024 .addReg(FF1->getOperand(0).getReg())
5025 .addReg(ActiveBits->getOperand(0).getReg());
5026
5027 // Add phi nodes
5028 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5029 .addMBB(ComputeLoop);
5030 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5031 .addMBB(ComputeLoop);
5032
5033 // Create the loop-back branch.
5034 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5035 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5036 .addReg(NewActiveBits->getOperand(0).getReg())
5037 .addImm(0);
5038 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5039 .addMBB(ComputeLoop);
5040
5041 RetBB = ComputeEnd;
5042 }
5043 MI.eraseFromParent();
5044 return RetBB;
5045}
5046
5047MachineBasicBlock *
5048SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5049 MachineBasicBlock *BB) const {
5050
5051 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5052 MachineFunction *MF = BB->getParent();
5053 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5054
5055 switch (MI.getOpcode()) {
5056 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5057 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5058 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5059 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5060 case AMDGPU::S_UADDO_PSEUDO:
5061 case AMDGPU::S_USUBO_PSEUDO: {
5062 const DebugLoc &DL = MI.getDebugLoc();
5063 MachineOperand &Dest0 = MI.getOperand(0);
5064 MachineOperand &Dest1 = MI.getOperand(1);
5065 MachineOperand &Src0 = MI.getOperand(2);
5066 MachineOperand &Src1 = MI.getOperand(3);
5067
5068 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5069 ? AMDGPU::S_ADD_I32
5070 : AMDGPU::S_SUB_I32;
5071 // clang-format off
5072 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5073 .add(Src0)
5074 .add(Src1);
5075 // clang-format on
5076
5077 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5078 .addImm(1)
5079 .addImm(0);
5080
5081 MI.eraseFromParent();
5082 return BB;
5083 }
5084 case AMDGPU::S_ADD_U64_PSEUDO:
5085 case AMDGPU::S_SUB_U64_PSEUDO: {
5086 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5087 // For GFX12, we emit s_add_u64 and s_sub_u64.
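 // The 32-bit sequence is an s_add_u32/s_sub_u32 on the low half followed by
 // an s_addc_u32/s_subb_u32 on the high half that consumes the carry in SCC.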
5088 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5090 const DebugLoc &DL = MI.getDebugLoc();
5091 MachineOperand &Dest = MI.getOperand(0);
5092 MachineOperand &Src0 = MI.getOperand(1);
5093 MachineOperand &Src1 = MI.getOperand(2);
5094 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5095 if (Subtarget->hasScalarAddSub64()) {
5096 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5097 // clang-format off
5098 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5099 .add(Src0)
5100 .add(Src1);
5101 // clang-format on
5102 } else {
5103 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5104 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5105
5106 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5107 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5108
5109 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5110 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5111 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5112 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5113
5114 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5115 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5116 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5117 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5118
5119 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5120 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5121 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5122 .add(Src0Sub0)
5123 .add(Src1Sub0);
5124 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5125 .add(Src0Sub1)
5126 .add(Src1Sub1);
5127 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5128 .addReg(DestSub0)
5129 .addImm(AMDGPU::sub0)
5130 .addReg(DestSub1)
5131 .addImm(AMDGPU::sub1);
5132 }
5133 MI.eraseFromParent();
5134 return BB;
5135 }
5136 case AMDGPU::V_ADD_U64_PSEUDO:
5137 case AMDGPU::V_SUB_U64_PSEUDO: {
5139 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5140 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5141 const DebugLoc &DL = MI.getDebugLoc();
5142
5143 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5144
5145 MachineOperand &Dest = MI.getOperand(0);
5146 MachineOperand &Src0 = MI.getOperand(1);
5147 MachineOperand &Src1 = MI.getOperand(2);
5148
5149 if (IsAdd && ST.hasLshlAddB64()) {
5150 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5151 Dest.getReg())
5152 .add(Src0)
5153 .addImm(0)
5154 .add(Src1);
5155 TII->legalizeOperands(*Add);
5156 MI.eraseFromParent();
5157 return BB;
5158 }
5159
5160 const auto *CarryRC = TRI->getWaveMaskRegClass();
5161
5162 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5163 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5164
5165 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5166 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5167
5168 const TargetRegisterClass *Src0RC = Src0.isReg()
5169 ? MRI.getRegClass(Src0.getReg())
5170 : &AMDGPU::VReg_64RegClass;
5171 const TargetRegisterClass *Src1RC = Src1.isReg()
5172 ? MRI.getRegClass(Src1.getReg())
5173 : &AMDGPU::VReg_64RegClass;
5174
5175 const TargetRegisterClass *Src0SubRC =
5176 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5177 const TargetRegisterClass *Src1SubRC =
5178 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5179
5180 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5181 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5182 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5183 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5184
5185 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5186 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5187 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5188 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5189
5190 unsigned LoOpc =
5191 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5192 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5193 .addReg(CarryReg, RegState::Define)
5194 .add(SrcReg0Sub0)
5195 .add(SrcReg1Sub0)
5196 .addImm(0); // clamp bit
5197
5198 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5199 MachineInstr *HiHalf =
5200 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5201 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5202 .add(SrcReg0Sub1)
5203 .add(SrcReg1Sub1)
5204 .addReg(CarryReg, RegState::Kill)
5205 .addImm(0); // clamp bit
5206
5207 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5208 .addReg(DestSub0)
5209 .addImm(AMDGPU::sub0)
5210 .addReg(DestSub1)
5211 .addImm(AMDGPU::sub1);
5212 TII->legalizeOperands(*LoHalf);
5213 TII->legalizeOperands(*HiHalf);
5214 MI.eraseFromParent();
5215 return BB;
5216 }
5217 case AMDGPU::S_ADD_CO_PSEUDO:
5218 case AMDGPU::S_SUB_CO_PSEUDO: {
5219 // This pseudo can only be selected from a uniform add/subcarry node.
5220 // All the VGPR operands are therefore assumed to hold wave-uniform (splat)
5221 // values.
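 // The incoming carry (Src2) is compared against zero to materialize SCC,
 // S_ADDC_U32/S_SUBB_U32 then consumes it, and an S_CSELECT captures the
 // outgoing carry from SCC into the carry destination.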
5223 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5224 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5226 const DebugLoc &DL = MI.getDebugLoc();
5227 MachineOperand &Dest = MI.getOperand(0);
5228 MachineOperand &CarryDest = MI.getOperand(1);
5229 MachineOperand &Src0 = MI.getOperand(2);
5230 MachineOperand &Src1 = MI.getOperand(3);
5231 MachineOperand &Src2 = MI.getOperand(4);
5232 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5233 ? AMDGPU::S_ADDC_U32
5234 : AMDGPU::S_SUBB_U32;
5235 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5236 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5237 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5238 .addReg(Src0.getReg());
5239 Src0.setReg(RegOp0);
5240 }
5241 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5242 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5243 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5244 .addReg(Src1.getReg());
5245 Src1.setReg(RegOp1);
5246 }
5247 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5248 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5249 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5250 .addReg(Src2.getReg());
5251 Src2.setReg(RegOp2);
5252 }
5253
5254 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5255 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5256 assert(WaveSize == 64 || WaveSize == 32);
5257
5258 if (WaveSize == 64) {
5259 if (ST.hasScalarCompareEq64()) {
5260 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5261 .addReg(Src2.getReg())
5262 .addImm(0);
5263 } else {
5264 const TargetRegisterClass *SubRC =
5265 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5266 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5267 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5268 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5269 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5270 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5271
5272 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5273 .add(Src2Sub0)
5274 .add(Src2Sub1);
5275
5276 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5277 .addReg(Src2_32, RegState::Kill)
5278 .addImm(0);
5279 }
5280 } else {
5281 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5282 .addReg(Src2.getReg())
5283 .addImm(0);
5284 }
5285
5286 // clang-format off
5287 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5288 .add(Src0)
5289 .add(Src1);
5290 // clang-format on
5291
5292 unsigned SelOpc =
5293 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5294
5295 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5296 .addImm(-1)
5297 .addImm(0);
5298
5299 MI.eraseFromParent();
5300 return BB;
5301 }
5302 case AMDGPU::SI_INIT_M0: {
5303 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5304 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5305 .add(MI.getOperand(0));
5306 MI.eraseFromParent();
5307 return BB;
5308 }
5309 case AMDGPU::GET_GROUPSTATICSIZE: {
5310 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5311 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5312 DebugLoc DL = MI.getDebugLoc();
5313 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5314 .add(MI.getOperand(0))
5315 .addImm(MFI->getLDSSize());
5316 MI.eraseFromParent();
5317 return BB;
5318 }
5319 case AMDGPU::GET_SHADERCYCLESHILO: {
5322 const DebugLoc &DL = MI.getDebugLoc();
5323 // The algorithm is:
5324 //
5325 // hi1 = getreg(SHADER_CYCLES_HI)
5326 // lo1 = getreg(SHADER_CYCLES_LO)
5327 // hi2 = getreg(SHADER_CYCLES_HI)
5328 //
5329 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5330 // Otherwise there was overflow and the result is hi2:0. In both cases the
5331 // result should represent the actual time at some point during the sequence
5332 // of three getregs.
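// In machine code the comparison and select below are realized through SCC:
// S_CMP_EQ_U32 hi1, hi2 followed by S_CSELECT_B32 lo1, 0, with the chosen low
// half and hi2 combined into the 64-bit result via REG_SEQUENCE.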
5333 using namespace AMDGPU::Hwreg;
5334 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5335 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5336 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5337 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5339 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5340 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5341 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5342 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5343 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5344 .addReg(RegHi1)
5345 .addReg(RegHi2);
5346 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5347 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5348 .addReg(RegLo1)
5349 .addImm(0);
5350 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5351 .add(MI.getOperand(0))
5352 .addReg(RegLo)
5353 .addImm(AMDGPU::sub0)
5354 .addReg(RegHi2)
5355 .addImm(AMDGPU::sub1);
5356 MI.eraseFromParent();
5357 return BB;
5358 }
5359 case AMDGPU::SI_INDIRECT_SRC_V1:
5360 case AMDGPU::SI_INDIRECT_SRC_V2:
5361 case AMDGPU::SI_INDIRECT_SRC_V4:
5362 case AMDGPU::SI_INDIRECT_SRC_V8:
5363 case AMDGPU::SI_INDIRECT_SRC_V9:
5364 case AMDGPU::SI_INDIRECT_SRC_V10:
5365 case AMDGPU::SI_INDIRECT_SRC_V11:
5366 case AMDGPU::SI_INDIRECT_SRC_V12:
5367 case AMDGPU::SI_INDIRECT_SRC_V16:
5368 case AMDGPU::SI_INDIRECT_SRC_V32:
5369 return emitIndirectSrc(MI, *BB, *getSubtarget());
5370 case AMDGPU::SI_INDIRECT_DST_V1:
5371 case AMDGPU::SI_INDIRECT_DST_V2:
5372 case AMDGPU::SI_INDIRECT_DST_V4:
5373 case AMDGPU::SI_INDIRECT_DST_V8:
5374 case AMDGPU::SI_INDIRECT_DST_V9:
5375 case AMDGPU::SI_INDIRECT_DST_V10:
5376 case AMDGPU::SI_INDIRECT_DST_V11:
5377 case AMDGPU::SI_INDIRECT_DST_V12:
5378 case AMDGPU::SI_INDIRECT_DST_V16:
5379 case AMDGPU::SI_INDIRECT_DST_V32:
5380 return emitIndirectDst(MI, *BB, *getSubtarget());
5381 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5382 case AMDGPU::SI_KILL_I1_PSEUDO:
5383 return splitKillBlock(MI, BB);
5384 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5386 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5387 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5388
5389 Register Dst = MI.getOperand(0).getReg();
5390 const MachineOperand &Src0 = MI.getOperand(1);
5391 const MachineOperand &Src1 = MI.getOperand(2);
5392 const DebugLoc &DL = MI.getDebugLoc();
5393 Register SrcCond = MI.getOperand(3).getReg();
5394
5395 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5396 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5397 const auto *CondRC = TRI->getWaveMaskRegClass();
5398 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5399
5400 const TargetRegisterClass *Src0RC = Src0.isReg()
5401 ? MRI.getRegClass(Src0.getReg())
5402 : &AMDGPU::VReg_64RegClass;
5403 const TargetRegisterClass *Src1RC = Src1.isReg()
5404 ? MRI.getRegClass(Src1.getReg())
5405 : &AMDGPU::VReg_64RegClass;
5406
5407 const TargetRegisterClass *Src0SubRC =
5408 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5409 const TargetRegisterClass *Src1SubRC =
5410 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5411
5412 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5413 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5414 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5415 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5416
5417 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5418 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5419 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5420 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5421
5422 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5423 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5424 .addImm(0)
5425 .add(Src0Sub0)
5426 .addImm(0)
5427 .add(Src1Sub0)
5428 .addReg(SrcCondCopy);
5429 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5430 .addImm(0)
5431 .add(Src0Sub1)
5432 .addImm(0)
5433 .add(Src1Sub1)
5434 .addReg(SrcCondCopy);
5435
5436 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5437 .addReg(DstLo)
5438 .addImm(AMDGPU::sub0)
5439 .addReg(DstHi)
5440 .addImm(AMDGPU::sub1);
5441 MI.eraseFromParent();
5442 return BB;
5443 }
5444 case AMDGPU::SI_BR_UNDEF: {
5446 const DebugLoc &DL = MI.getDebugLoc();
5447 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5448 .add(MI.getOperand(0));
5449 Br->getOperand(1).setIsUndef(); // read undef SCC
5450 MI.eraseFromParent();
5451 return BB;
5452 }
5453 case AMDGPU::ADJCALLSTACKUP:
5454 case AMDGPU::ADJCALLSTACKDOWN: {
5456 MachineInstrBuilder MIB(*MF, &MI);
5457 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5458 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5459 return BB;
5460 }
5461 case AMDGPU::SI_CALL_ISEL: {
5463 const DebugLoc &DL = MI.getDebugLoc();
5464
5465 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5466
5468 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5469
5470 for (const MachineOperand &MO : MI.operands())
5471 MIB.add(MO);
5472
5473 MIB.cloneMemRefs(MI);
5474 MI.eraseFromParent();
5475 return BB;
5476 }
5477 case AMDGPU::V_ADD_CO_U32_e32:
5478 case AMDGPU::V_SUB_CO_U32_e32:
5479 case AMDGPU::V_SUBREV_CO_U32_e32: {
5480 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5481 const DebugLoc &DL = MI.getDebugLoc();
5482 unsigned Opc = MI.getOpcode();
5483
5484 bool NeedClampOperand = false;
5485 if (TII->pseudoToMCOpcode(Opc) == -1) {
5486 Opc = AMDGPU::getVOPe64(Opc);
5487 NeedClampOperand = true;
5488 }
5489
5490 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5491 if (TII->isVOP3(*I)) {
5492 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5493 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5494 I.addReg(TRI->getVCC(), RegState::Define);
5495 }
5496 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5497 if (NeedClampOperand)
5498 I.addImm(0); // clamp bit for e64 encoding
5499
5500 TII->legalizeOperands(*I);
5501
5502 MI.eraseFromParent();
5503 return BB;
5504 }
5505 case AMDGPU::V_ADDC_U32_e32:
5506 case AMDGPU::V_SUBB_U32_e32:
5507 case AMDGPU::V_SUBBREV_U32_e32:
5508 // These instructions have an implicit use of vcc which counts towards the
5509 // constant bus limit.
5510 TII->legalizeOperands(MI);
5511 return BB;
5512 case AMDGPU::DS_GWS_INIT:
5513 case AMDGPU::DS_GWS_SEMA_BR:
5514 case AMDGPU::DS_GWS_BARRIER:
5515 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5516 [[fallthrough]];
5517 case AMDGPU::DS_GWS_SEMA_V:
5518 case AMDGPU::DS_GWS_SEMA_P:
5519 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5520 // An s_waitcnt 0 is required to be the instruction immediately following.
5521 if (getSubtarget()->hasGWSAutoReplay()) {
5523 return BB;
5524 }
5525
5526 return emitGWSMemViolTestLoop(MI, BB);
5527 case AMDGPU::S_SETREG_B32: {
5528 // Try to optimize cases that only set the denormal mode or rounding mode.
5529 //
5530 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5531 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5532 // instead.
5533 //
5534 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5535 // allow you to have a no-side-effect instruction in the output of a
5536 // side-effecting pattern.
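// For example, when the source is a constant and the field covers both the
// round and denorm bits of the MODE register,
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), <imm>
// is emitted below as
//   s_round_mode  (<imm> & 0xf)
//   s_denorm_mode ((<imm> >> 4) & 0xf)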
5537 auto [ID, Offset, Width] =
5538 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5539 if (ID != AMDGPU::Hwreg::ID_MODE)
5540 return BB;
5541
5542 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5543 const unsigned SetMask = WidthMask << Offset;
5544
5545 if (getSubtarget()->hasDenormModeInst()) {
5546 unsigned SetDenormOp = 0;
5547 unsigned SetRoundOp = 0;
5548
5549 // The dedicated instructions can only set the whole denorm or round mode
5550 // at once, not a subset of bits in either.
5551 if (SetMask ==
5552 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5553 // If this fully sets both the round and denorm mode, emit the two
5554 // dedicated instructions for these.
5555 SetRoundOp = AMDGPU::S_ROUND_MODE;
5556 SetDenormOp = AMDGPU::S_DENORM_MODE;
5557 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5558 SetRoundOp = AMDGPU::S_ROUND_MODE;
5559 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5560 SetDenormOp = AMDGPU::S_DENORM_MODE;
5561 }
5562
5563 if (SetRoundOp || SetDenormOp) {
5565 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5566 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5567 unsigned ImmVal = Def->getOperand(1).getImm();
5568 if (SetRoundOp) {
5569 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5570 .addImm(ImmVal & 0xf);
5571
5572 // If we also have the denorm mode, get just the denorm mode bits.
5573 ImmVal >>= 4;
5574 }
5575
5576 if (SetDenormOp) {
5577 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5578 .addImm(ImmVal & 0xf);
5579 }
5580
5581 MI.eraseFromParent();
5582 return BB;
5583 }
5584 }
5585 }
5586
5587 // If only FP bits are touched, use the no-side-effects pseudo.
5588 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5589 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5590 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5591
5592 return BB;
5593 }
5594 case AMDGPU::S_INVERSE_BALLOT_U32:
5595 case AMDGPU::S_INVERSE_BALLOT_U64:
5596 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5597 // necessary. After that they are equivalent to a COPY.
5598 MI.setDesc(TII->get(AMDGPU::COPY));
5599 return BB;
5600 case AMDGPU::ENDPGM_TRAP: {
5601 const DebugLoc &DL = MI.getDebugLoc();
5602 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5603 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5604 MI.addOperand(MachineOperand::CreateImm(0));
5605 return BB;
5606 }
5607
5608 // We need a block split to make the real endpgm a terminator. We also don't
5609 // want to break phis in successor blocks, so we can't just delete to the
5610 // end of the block.
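// Concretely, the block is split at this instruction, a new block containing
// only S_ENDPGM 0 is appended to the function, and the original block gets an
// S_CBRANCH_EXECNZ into it, making the endpgm a proper terminator.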
5611
5612 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5614 MF->push_back(TrapBB);
5615 // clang-format off
5616 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5617 .addImm(0);
5618 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5619 .addMBB(TrapBB);
5620 // clang-format on
5621
5622 BB->addSuccessor(TrapBB);
5623 MI.eraseFromParent();
5624 return SplitBB;
5625 }
5626 case AMDGPU::SIMULATED_TRAP: {
5627 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5629 MachineBasicBlock *SplitBB =
5630 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5631 MI.eraseFromParent();
5632 return SplitBB;
5633 }
5634 default:
5635 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5636 if (!MI.mayStore())
5638 return BB;
5639 }
5640 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5641 }
5642}
5643
5645 // This currently forces unfolding various combinations of fsub into fma with
5646 // free fneg'd operands. As long as we have fast FMA (controlled by
5647 // isFMAFasterThanFMulAndFAdd), we should perform these.
5648
5649 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5650 // most of these combines appear to be cycle neutral but save on instruction
5651 // count / code size.
5652 return true;
5653}
5654
5656
5658 EVT VT) const {
5659 if (!VT.isVector()) {
5660 return MVT::i1;
5661 }
5662 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5663}
5664
5666 // TODO: Should i16 always be used if legal? For now it would force VALU
5667 // shifts.
5668 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5669}
5670
5672 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5673 ? Ty.changeElementSize(16)
5674 : Ty.changeElementSize(32);
5675}
5676
5677// Answering this is somewhat tricky and depends on the specific device, since
5678// devices have different rates for fma or for all f64 operations.
5679//
5680// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5681// regardless of which device (although the number of cycles differs between
5682// devices), so it is always profitable for f64.
5683//
5684// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5685// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5686// which we can always do even without fused FP ops since it returns the same
5687// result as the separate operations and since it is always full
5688// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5689// however does not support denormals, so we do report fma as faster if we have
5690// a fast fma device and require denormals.
5691//
5693 EVT VT) const {
5694 VT = VT.getScalarType();
5695
5696 switch (VT.getSimpleVT().SimpleTy) {
5697 case MVT::f32: {
5698 // If mad is not available this depends only on if f32 fma is full rate.
5699 if (!Subtarget->hasMadMacF32Insts())
5700 return Subtarget->hasFastFMAF32();
5701
5702 // Otherwise f32 mad is always full rate and returns the same result as
5703 // the separate operations, so it should be preferred over fma.
5704 // However, it does not support denormals.
5706 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5707
5708 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5709 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5710 }
5711 case MVT::f64:
5712 return true;
5713 case MVT::f16:
5714 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5715 default:
5716 break;
5717 }
5718
5719 return false;
5720}
5721
5723 LLT Ty) const {
5724 switch (Ty.getScalarSizeInBits()) {
5725 case 16:
5726 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5727 case 32:
5728 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5729 case 64:
5730 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5731 default:
5732 break;
5733 }
5734
5735 return false;
5736}
5737
5738// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5739// specific details.
5741 Type *Ty) const {
5742 switch (Ty->getScalarSizeInBits()) {
5743 case 16: {
5745 return Subtarget->has16BitInsts() &&
5746 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5747 }
5748 case 32: {
5749 if (!Subtarget->hasMadMacF32Insts())
5750 return Subtarget->hasFastFMAF32();
5751
5753 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5754 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5755
5756 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5757 }
5758 case 64:
5759 return true;
5760 default:
5761 break;
5762 }
5763
5764 return false;
5765}
5766
5768 if (!Ty.isScalar())
5769 return false;
5770
5771 if (Ty.getScalarSizeInBits() == 16)
5772 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5773 if (Ty.getScalarSizeInBits() == 32)
5774 return Subtarget->hasMadMacF32Insts() &&
5775 denormalModeIsFlushAllF32(*MI.getMF());
5776
5777 return false;
5778}
5779
5781 const SDNode *N) const {
5782 // TODO: Check future ftz flag
5783 // v_mad_f32/v_mac_f32 do not support denormals.
5784 EVT VT = N->getValueType(0);
5785 if (VT == MVT::f32)
5786 return Subtarget->hasMadMacF32Insts() &&
5788 if (VT == MVT::f16) {
5789 return Subtarget->hasMadF16() &&
5791 }
5792
5793 return false;
5794}
5795
5796//===----------------------------------------------------------------------===//
5797// Custom DAG Lowering Operations
5798//===----------------------------------------------------------------------===//
5799
5800// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5801// wider vector type is legal.
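// For example, a v4f16 fneg is lowered here as two v2f16 fnegs whose results
// are concatenated back into a v4f16, rather than four scalar f16 operations.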
5803 SelectionDAG &DAG) const {
5804 unsigned Opc = Op.getOpcode();
5805 EVT VT = Op.getValueType();
5806 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5807 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5808 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5809 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5810
5811 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5812
5813 SDLoc SL(Op);
5814 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5815 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5816
5817 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5818}
5819
5820// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5821// wider vector type is legal.
5823 SelectionDAG &DAG) const {
5824 unsigned Opc = Op.getOpcode();
5825 EVT VT = Op.getValueType();
5826 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5827 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5828 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5829 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5830
5831 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5832 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5833
5834 SDLoc SL(Op);
5835
5836 SDValue OpLo =
5837 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5838 SDValue OpHi =
5839 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5840
5841 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5842}
5843
5845 SelectionDAG &DAG) const {
5846 unsigned Opc = Op.getOpcode();
5847 EVT VT = Op.getValueType();
5848 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5849 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5850 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5851 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5852 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5853 VT == MVT::v32bf16);
5854
5855 SDValue Op0 = Op.getOperand(0);
5856 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5857 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5858 : std::pair(Op0, Op0);
5859
5860 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5861 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5862
5863 SDLoc SL(Op);
5864 auto ResVT = DAG.GetSplitDestVTs(VT);
5865
5866 SDValue OpLo =
5867 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5868 SDValue OpHi =
5869 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5870
5871 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5872}
5873
5875 switch (Op.getOpcode()) {
5876 default:
5878 case ISD::BRCOND:
5879 return LowerBRCOND(Op, DAG);
5880 case ISD::RETURNADDR:
5881 return LowerRETURNADDR(Op, DAG);
5882 case ISD::LOAD: {
5883 SDValue Result = LowerLOAD(Op, DAG);
5884 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5885 "Load should return a value and a chain");
5886 return Result;
5887 }
5888 case ISD::FSQRT: {
5889 EVT VT = Op.getValueType();
5890 if (VT == MVT::f32)
5891 return lowerFSQRTF32(Op, DAG);
5892 if (VT == MVT::f64)
5893 return lowerFSQRTF64(Op, DAG);
5894 return SDValue();
5895 }
5896 case ISD::FSIN:
5897 case ISD::FCOS:
5898 return LowerTrig(Op, DAG);
5899 case ISD::SELECT:
5900 return LowerSELECT(Op, DAG);
5901 case ISD::FDIV:
5902 return LowerFDIV(Op, DAG);
5903 case ISD::FFREXP:
5904 return LowerFFREXP(Op, DAG);
5906 return LowerATOMIC_CMP_SWAP(Op, DAG);
5907 case ISD::STORE:
5908 return LowerSTORE(Op, DAG);
5909 case ISD::GlobalAddress: {
5912 return LowerGlobalAddress(MFI, Op, DAG);
5913 }
5915 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5917 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5919 return LowerINTRINSIC_VOID(Op, DAG);
5920 case ISD::ADDRSPACECAST:
5921 return lowerADDRSPACECAST(Op, DAG);
5923 return lowerINSERT_SUBVECTOR(Op, DAG);
5925 return lowerINSERT_VECTOR_ELT(Op, DAG);
5927 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5929 return lowerVECTOR_SHUFFLE(Op, DAG);
5931 return lowerSCALAR_TO_VECTOR(Op, DAG);
5932 case ISD::BUILD_VECTOR:
5933 return lowerBUILD_VECTOR(Op, DAG);
5934 case ISD::FP_ROUND:
5936 return lowerFP_ROUND(Op, DAG);
5937 case ISD::TRAP:
5938 return lowerTRAP(Op, DAG);
5939 case ISD::DEBUGTRAP:
5940 return lowerDEBUGTRAP(Op, DAG);
5941 case ISD::ABS:
5942 case ISD::FABS:
5943 case ISD::FNEG:
5944 case ISD::FCANONICALIZE:
5945 case ISD::BSWAP:
5946 return splitUnaryVectorOp(Op, DAG);
5947 case ISD::FMINNUM:
5948 case ISD::FMAXNUM:
5949 return lowerFMINNUM_FMAXNUM(Op, DAG);
5950 case ISD::FLDEXP:
5951 case ISD::STRICT_FLDEXP:
5952 return lowerFLDEXP(Op, DAG);
5953 case ISD::FMA:
5954 return splitTernaryVectorOp(Op, DAG);
5955 case ISD::FP_TO_SINT:
5956 case ISD::FP_TO_UINT:
5957 return LowerFP_TO_INT(Op, DAG);
5958 case ISD::SHL:
5959 case ISD::SRA:
5960 case ISD::SRL:
5961 case ISD::ADD:
5962 case ISD::SUB:
5963 case ISD::SMIN:
5964 case ISD::SMAX:
5965 case ISD::UMIN:
5966 case ISD::UMAX:
5967 case ISD::FADD:
5968 case ISD::FMUL:
5969 case ISD::FMINNUM_IEEE:
5970 case ISD::FMAXNUM_IEEE:
5971 case ISD::FMINIMUM:
5972 case ISD::FMAXIMUM:
5973 case ISD::FMINIMUMNUM:
5974 case ISD::FMAXIMUMNUM:
5975 case ISD::UADDSAT:
5976 case ISD::USUBSAT:
5977 case ISD::SADDSAT:
5978 case ISD::SSUBSAT:
5979 return splitBinaryVectorOp(Op, DAG);
5980 case ISD::MUL:
5981 return lowerMUL(Op, DAG);
5982 case ISD::SMULO:
5983 case ISD::UMULO:
5984 return lowerXMULO(Op, DAG);
5985 case ISD::SMUL_LOHI:
5986 case ISD::UMUL_LOHI:
5987 return lowerXMUL_LOHI(Op, DAG);
5989 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5990 case ISD::STACKSAVE:
5991 return LowerSTACKSAVE(Op, DAG);
5992 case ISD::GET_ROUNDING:
5993 return lowerGET_ROUNDING(Op, DAG);
5994 case ISD::SET_ROUNDING:
5995 return lowerSET_ROUNDING(Op, DAG);
5996 case ISD::PREFETCH:
5997 return lowerPREFETCH(Op, DAG);
5998 case ISD::FP_EXTEND:
6000 return lowerFP_EXTEND(Op, DAG);
6001 case ISD::GET_FPENV:
6002 return lowerGET_FPENV(Op, DAG);
6003 case ISD::SET_FPENV:
6004 return lowerSET_FPENV(Op, DAG);
6005 }
6006 return SDValue();
6007}
6008
6009// Used for D16: Casts the result of an instruction into the right vector type
6010// and packs the values if the load returns unpacked values.
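// For example, with unpacked D16 a v4f16 load result arrives as v4i32; the
// code below truncates each element to i16, rebuilds a v4i16 (padding odd
// element counts such as v3f16 up to four elements), and bitcasts the result
// back to the f16 vector type.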
6012 const SDLoc &DL, SelectionDAG &DAG,
6013 bool Unpacked) {
6014 if (!LoadVT.isVector())
6015 return Result;
6016
6017 // Cast back to the original packed type or to a larger type that is a
6018 // multiple of 32 bits for D16. Widening the return type is required for
6019 // legalization.
6020 EVT FittingLoadVT = LoadVT;
6021 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6022 FittingLoadVT =
6024 LoadVT.getVectorNumElements() + 1);
6025 }
6026
6027 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6028 // Truncate to v2i16/v4i16.
6029 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6030
6031 // Work around the legalizer not scalarizing the truncate after vector op
6032 // legalization but also not creating an intermediate vector trunc.
6034 DAG.ExtractVectorElements(Result, Elts);
6035 for (SDValue &Elt : Elts)
6036 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6037
6038 // Pad illegal v1i16/v3f16 to v4i16
6039 if ((LoadVT.getVectorNumElements() % 2) == 1)
6040 Elts.push_back(DAG.getUNDEF(MVT::i16));
6041
6042 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6043
6044 // Bitcast to original type (v2f16/v4f16).
6045 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6046 }
6047
6048 // Cast back to the original packed type.
6049 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6050}
6051
6052SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6053 SelectionDAG &DAG,
6055 bool IsIntrinsic) const {
6056 SDLoc DL(M);
6057
6058 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6059 EVT LoadVT = M->getValueType(0);
6060
6061 EVT EquivLoadVT = LoadVT;
6062 if (LoadVT.isVector()) {
6063 if (Unpacked) {
6064 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6065 LoadVT.getVectorNumElements());
6066 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6067 // Widen v3f16 to legal type
6068 EquivLoadVT =
6070 LoadVT.getVectorNumElements() + 1);
6071 }
6072 }
6073
6074 // Change from v4f16/v2f16 to EquivLoadVT.
6075 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6076
6078 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6079 M->getMemoryVT(), M->getMemOperand());
6080
6081 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6082
6083 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6084}
6085
6086SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6087 SelectionDAG &DAG,
6088 ArrayRef<SDValue> Ops) const {
6089 SDLoc DL(M);
6090 EVT LoadVT = M->getValueType(0);
6091 EVT EltType = LoadVT.getScalarType();
6092 EVT IntVT = LoadVT.changeTypeToInteger();
6093
6094 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6095
6096 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6097 bool IsTFE = M->getNumValues() == 3;
6098
6099 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6100 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6101 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6102 : AMDGPUISD::BUFFER_LOAD;
6103
6104 if (IsD16) {
6105 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6106 }
6107
6108 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6109 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6110 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6111 IsTFE);
6112
6113 if (isTypeLegal(LoadVT)) {
6114 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6115 M->getMemOperand(), DAG);
6116 }
6117
6118 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6119 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6120 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6121 M->getMemOperand(), DAG);
6122 return DAG.getMergeValues(
6123 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6124 DL);
6125}
6126
6128 SelectionDAG &DAG) {
6129 EVT VT = N->getValueType(0);
6130 unsigned CondCode = N->getConstantOperandVal(3);
6131 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6132 return DAG.getUNDEF(VT);
6133
6134 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6135
6136 SDValue LHS = N->getOperand(1);
6137 SDValue RHS = N->getOperand(2);
6138
6139 SDLoc DL(N);
6140
6141 EVT CmpVT = LHS.getValueType();
6142 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6143 unsigned PromoteOp =
6145 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6146 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6147 }
6148
6149 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6150
6151 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6152 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6153
6154 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6155 DAG.getCondCode(CCOpcode));
6156 if (VT.bitsEq(CCVT))
6157 return SetCC;
6158 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6159}
6160
6162 SelectionDAG &DAG) {
6163 EVT VT = N->getValueType(0);
6164
6165 unsigned CondCode = N->getConstantOperandVal(3);
6166 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6167 return DAG.getUNDEF(VT);
6168
6169 SDValue Src0 = N->getOperand(1);
6170 SDValue Src1 = N->getOperand(2);
6171 EVT CmpVT = Src0.getValueType();
6172 SDLoc SL(N);
6173
6174 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6175 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6176 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6177 }
6178
6179 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6180 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6181 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6182 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6183 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6184 DAG.getCondCode(CCOpcode));
6185 if (VT.bitsEq(CCVT))
6186 return SetCC;
6187 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6188}
6189
6191 SelectionDAG &DAG) {
6192 EVT VT = N->getValueType(0);
6193 SDValue Src = N->getOperand(1);
6194 SDLoc SL(N);
6195
6196 if (Src.getOpcode() == ISD::SETCC) {
6197 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6198 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6199 Src.getOperand(1), Src.getOperand(2));
6200 }
6201 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6202 // (ballot 0) -> 0
6203 if (Arg->isZero())
6204 return DAG.getConstant(0, SL, VT);
6205
6206 // (ballot 1) -> EXEC/EXEC_LO
6207 if (Arg->isOne()) {
6208 Register Exec;
6209 if (VT.getScalarSizeInBits() == 32)
6210 Exec = AMDGPU::EXEC_LO;
6211 else if (VT.getScalarSizeInBits() == 64)
6212 Exec = AMDGPU::EXEC;
6213 else
6214 return SDValue();
6215
6216 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6217 }
6218 }
6219
6220 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6221 // ISD::SETNE)
6222 return DAG.getNode(
6223 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6224 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6225}
6226
6228 SelectionDAG &DAG) {
6229 EVT VT = N->getValueType(0);
6230 unsigned ValSize = VT.getSizeInBits();
6231 unsigned IID = N->getConstantOperandVal(0);
6232 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6233 IID == Intrinsic::amdgcn_permlanex16;
6234 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6235 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6236 SDLoc SL(N);
6237 MVT IntVT = MVT::getIntegerVT(ValSize);
6238 const GCNSubtarget *ST = TLI.getSubtarget();
6239 unsigned SplitSize = 32;
6240 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6241 ST->hasDPALU_DPP() &&
6242 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6243 SplitSize = 64;
6244
6245 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6246 SDValue Src2, MVT ValT) -> SDValue {
6248 switch (IID) {
6249 case Intrinsic::amdgcn_permlane16:
6250 case Intrinsic::amdgcn_permlanex16:
6251 case Intrinsic::amdgcn_update_dpp:
6252 Operands.push_back(N->getOperand(6));
6253 Operands.push_back(N->getOperand(5));
6254 Operands.push_back(N->getOperand(4));
6255 [[fallthrough]];
6256 case Intrinsic::amdgcn_writelane:
6257 Operands.push_back(Src2);
6258 [[fallthrough]];
6259 case Intrinsic::amdgcn_readlane:
6260 case Intrinsic::amdgcn_set_inactive:
6261 case Intrinsic::amdgcn_set_inactive_chain_arg:
6262 case Intrinsic::amdgcn_mov_dpp8:
6263 Operands.push_back(Src1);
6264 [[fallthrough]];
6265 case Intrinsic::amdgcn_readfirstlane:
6266 case Intrinsic::amdgcn_permlane64:
6267 Operands.push_back(Src0);
6268 break;
6269 default:
6270 llvm_unreachable("unhandled lane op");
6271 }
6272
6273 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6274 std::reverse(Operands.begin(), Operands.end());
6275
6276 if (SDNode *GL = N->getGluedNode()) {
6277 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6278 GL = GL->getOperand(0).getNode();
6279 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6280 SDValue(GL, 0)));
6281 }
6282
6283 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6284 };
6285
6286 SDValue Src0 = N->getOperand(1);
6287 SDValue Src1, Src2;
6288 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6289 IID == Intrinsic::amdgcn_mov_dpp8 ||
6290 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6291 Src1 = N->getOperand(2);
6292 if (IID == Intrinsic::amdgcn_writelane ||
6293 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6294 Src2 = N->getOperand(3);
6295 }
6296
6297 if (ValSize == SplitSize) {
6298 // Already legal
6299 return SDValue();
6300 }
6301
6302 if (ValSize < 32) {
6303 bool IsFloat = VT.isFloatingPoint();
6304 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6305 SL, MVT::i32);
6306
6307 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6308 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6309 SL, MVT::i32);
6310 }
6311
6312 if (IID == Intrinsic::amdgcn_writelane) {
6313 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6314 SL, MVT::i32);
6315 }
6316
6317 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6318 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6319 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6320 }
6321
6322 if (ValSize % SplitSize != 0)
6323 return SDValue();
6324
6325 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6326 EVT VT = N->getValueType(0);
6327 unsigned NE = VT.getVectorNumElements();
6328 EVT EltVT = VT.getVectorElementType();
6330 unsigned NumOperands = N->getNumOperands();
6331 SmallVector<SDValue, 4> Operands(NumOperands);
6332 SDNode *GL = N->getGluedNode();
6333
6334 // only handle convergencectrl_glue
6336
6337 for (unsigned i = 0; i != NE; ++i) {
6338 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6339 ++j) {
6340 SDValue Operand = N->getOperand(j);
6341 EVT OperandVT = Operand.getValueType();
6342 if (OperandVT.isVector()) {
6343 // A vector operand; extract a single element.
6344 EVT OperandEltVT = OperandVT.getVectorElementType();
6345 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6346 Operand, DAG.getVectorIdxConstant(i, SL));
6347 } else {
6348 // A scalar operand; just use it as is.
6349 Operands[j] = Operand;
6350 }
6351 }
6352
6353 if (GL)
6354 Operands[NumOperands - 1] =
6355 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6356 SDValue(GL->getOperand(0).getNode(), 0));
6357
6358 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6359 }
6360
6361 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6362 return DAG.getBuildVector(VecVT, SL, Scalars);
6363 };
6364
6365 if (VT.isVector()) {
6366 switch (MVT::SimpleValueType EltTy =
6368 case MVT::i32:
6369 case MVT::f32:
6370 if (SplitSize == 32) {
6371 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6372 return unrollLaneOp(LaneOp.getNode());
6373 }
6374 [[fallthrough]];
6375 case MVT::i16:
6376 case MVT::f16:
6377 case MVT::bf16: {
6378 unsigned SubVecNumElt =
6379 SplitSize / VT.getVectorElementType().getSizeInBits();
6380 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6382 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6383 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6384 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6385 DAG.getConstant(EltIdx, SL, MVT::i32));
6386
6387 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6388 IsPermLane16)
6389 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6390 DAG.getConstant(EltIdx, SL, MVT::i32));
6391
6392 if (IID == Intrinsic::amdgcn_writelane)
6393 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6394 DAG.getConstant(EltIdx, SL, MVT::i32));
6395
6396 Pieces.push_back(
6397 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6398 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6399 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6400 EltIdx += SubVecNumElt;
6401 }
6402 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6403 }
6404 default:
6405 // Handle all other cases by bitcasting to i32 vectors
6406 break;
6407 }
6408 }
6409
6410 MVT VecVT =
6411 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6412 Src0 = DAG.getBitcast(VecVT, Src0);
6413
6414 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6415 Src1 = DAG.getBitcast(VecVT, Src1);
6416
6417 if (IID == Intrinsic::amdgcn_writelane)
6418 Src2 = DAG.getBitcast(VecVT, Src2);
6419
6420 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6421 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6422 return DAG.getBitcast(VT, UnrolledLaneOp);
6423}
6424
6427 SelectionDAG &DAG) const {
6428 switch (N->getOpcode()) {
6430 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6431 Results.push_back(Res);
6432 return;
6433 }
6435 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6436 Results.push_back(Res);
6437 return;
6438 }
6440 unsigned IID = N->getConstantOperandVal(0);
6441 switch (IID) {
6442 case Intrinsic::amdgcn_make_buffer_rsrc:
6443 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6444 return;
6445 case Intrinsic::amdgcn_cvt_pkrtz: {
6446 SDValue Src0 = N->getOperand(1);
6447 SDValue Src1 = N->getOperand(2);
6448 SDLoc SL(N);
6449 SDValue Cvt =
6450 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6451 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6452 return;
6453 }
6454 case Intrinsic::amdgcn_cvt_pknorm_i16:
6455 case Intrinsic::amdgcn_cvt_pknorm_u16:
6456 case Intrinsic::amdgcn_cvt_pk_i16:
6457 case Intrinsic::amdgcn_cvt_pk_u16: {
6458 SDValue Src0 = N->getOperand(1);
6459 SDValue Src1 = N->getOperand(2);
6460 SDLoc SL(N);
6461 unsigned Opcode;
6462
6463 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6465 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6467 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6469 else
6471
6472 EVT VT = N->getValueType(0);
6473 if (isTypeLegal(VT))
6474 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6475 else {
6476 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6477 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6478 }
6479 return;
6480 }
6481 case Intrinsic::amdgcn_s_buffer_load: {
6482 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6483 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
6484 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6485 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6486 // s_buffer_load_i8.
6487 if (!Subtarget->hasScalarSubwordLoads())
6488 return;
6489 SDValue Op = SDValue(N, 0);
6490 SDValue Rsrc = Op.getOperand(1);
6491 SDValue Offset = Op.getOperand(2);
6492 SDValue CachePolicy = Op.getOperand(3);
6493 EVT VT = Op.getValueType();
6494 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6495 SDLoc DL(Op);
6497 const DataLayout &DataLayout = DAG.getDataLayout();
6498 Align Alignment =
6504 VT.getStoreSize(), Alignment);
6505 SDValue LoadVal;
6506 if (!Offset->isDivergent()) {
6507 SDValue Ops[] = {Rsrc, // source register
6508 Offset, CachePolicy};
6509 SDValue BufferLoad =
6511 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6512 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6513 } else {
6514 SDValue Ops[] = {
6515 DAG.getEntryNode(), // Chain
6516 Rsrc, // rsrc
6517 DAG.getConstant(0, DL, MVT::i32), // vindex
6518 {}, // voffset
6519 {}, // soffset
6520 {}, // offset
6521 CachePolicy, // cachepolicy
6522 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6523 };
6524 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6525 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6526 }
6527 Results.push_back(LoadVal);
6528 return;
6529 }
6530 }
6531 break;
6532 }
6534 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6535 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6536 // FIXME: Hacky
6537 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6538 Results.push_back(Res.getOperand(I));
6539 }
6540 } else {
6541 Results.push_back(Res);
6542 Results.push_back(Res.getValue(1));
6543 }
6544 return;
6545 }
6546
6547 break;
6548 }
6549 case ISD::SELECT: {
6550 SDLoc SL(N);
6551 EVT VT = N->getValueType(0);
6552 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6553 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6554 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6555
6556 EVT SelectVT = NewVT;
6557 if (NewVT.bitsLT(MVT::i32)) {
6558 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6559 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6560 SelectVT = MVT::i32;
6561 }
6562
6563 SDValue NewSelect =
6564 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6565
6566 if (NewVT != SelectVT)
6567 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6568 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6569 return;
6570 }
6571 case ISD::FNEG: {
6572 if (N->getValueType(0) != MVT::v2f16)
6573 break;
6574
6575 SDLoc SL(N);
6576 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6577
6578 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6579 DAG.getConstant(0x80008000, SL, MVT::i32));
6580 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6581 return;
6582 }
6583 case ISD::FABS: {
6584 if (N->getValueType(0) != MVT::v2f16)
6585 break;
6586
6587 SDLoc SL(N);
6588 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6589
6590 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6591 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6592 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6593 return;
6594 }
6595 case ISD::FSQRT: {
6596 if (N->getValueType(0) != MVT::f16)
6597 break;
6598 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6599 break;
6600 }
6601 default:
6603 break;
6604 }
6605}
6606
6607/// Helper function for LowerBRCOND
6608static SDNode *findUser(SDValue Value, unsigned Opcode) {
6609
6610 for (SDUse &U : Value->uses()) {
6611 if (U.get() != Value)
6612 continue;
6613
6614 if (U.getUser()->getOpcode() == Opcode)
6615 return U.getUser();
6616 }
6617 return nullptr;
6618}
6619
6620unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6621 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6622 switch (Intr->getConstantOperandVal(1)) {
6623 case Intrinsic::amdgcn_if:
6624 return AMDGPUISD::IF;
6625 case Intrinsic::amdgcn_else:
6626 return AMDGPUISD::ELSE;
6627 case Intrinsic::amdgcn_loop:
6628 return AMDGPUISD::LOOP;
6629 case Intrinsic::amdgcn_end_cf:
6630 llvm_unreachable("should not occur");
6631 default:
6632 return 0;
6633 }
6634 }
6635
6636 // break, if_break, else_break are all only used as inputs to loop, not
6637 // directly as branch conditions.
6638 return 0;
6639}
6640
6642 const Triple &TT = getTargetMachine().getTargetTriple();
6646}
6647
6649 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6650 return false;
6651
6652 // FIXME: Either avoid relying on address space here or change the default
6653 // address space for functions to avoid the explicit check.
6654 return (GV->getValueType()->isFunctionTy() ||
6657}
6658
6660 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6661}
6662
6664 if (!GV->hasExternalLinkage())
6665 return true;
6666
6667 const auto OS = getTargetMachine().getTargetTriple().getOS();
6668 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6669}
6670
6671/// This transforms the control flow intrinsics to get the branch destination as
6672/// the last parameter; it also switches the branch target with BR if the need arises.
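/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rewritten
/// into an AMDGPUISD::IF node that carries the branch target as its final
/// operand, and any following unconditional BR is retargeted to match.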
6673SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6674 SDLoc DL(BRCOND);
6675
6676 SDNode *Intr = BRCOND.getOperand(1).getNode();
6677 SDValue Target = BRCOND.getOperand(2);
6678 SDNode *BR = nullptr;
6679 SDNode *SetCC = nullptr;
6680
6681 if (Intr->getOpcode() == ISD::SETCC) {
6682 // As long as we negate the condition everything is fine
6683 SetCC = Intr;
6684 Intr = SetCC->getOperand(0).getNode();
6685
6686 } else {
6687 // Get the target from BR if we don't negate the condition
6688 BR = findUser(BRCOND, ISD::BR);
6689 assert(BR && "brcond missing unconditional branch user");
6690 Target = BR->getOperand(1);
6691 }
6692
6693 unsigned CFNode = isCFIntrinsic(Intr);
6694 if (CFNode == 0) {
6695 // This is a uniform branch so we don't need to legalize.
6696 return BRCOND;
6697 }
6698
6699 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6700 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6701
6702 assert(!SetCC ||
6703 (SetCC->getConstantOperandVal(1) == 1 &&
6704 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6705 ISD::SETNE));
6706
6707 // operands of the new intrinsic call
6709 if (HaveChain)
6710 Ops.push_back(BRCOND.getOperand(0));
6711
6712 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6713 Ops.push_back(Target);
6714
6715 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6716
6717 // build the new intrinsic call
6718 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6719
6720 if (!HaveChain) {
6721 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6722
6723 Result = DAG.getMergeValues(Ops, DL).getNode();
6724 }
6725
6726 if (BR) {
6727 // Give the branch instruction our target
6728 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6729 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6730 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6731 }
6732
6733 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6734
6735 // Copy the intrinsic results to registers
6736 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6738 if (!CopyToReg)
6739 continue;
6740
6741 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6742 SDValue(Result, i - 1), SDValue());
6743
6744 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6745 }
6746
6747 // Remove the old intrinsic from the chain
6748 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6749 Intr->getOperand(0));
6750
6751 return Chain;
6752}
6753
6754SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6755 MVT VT = Op.getSimpleValueType();
6756 SDLoc DL(Op);
6757 // Checking the depth
6758 if (Op.getConstantOperandVal(0) != 0)
6759 return DAG.getConstant(0, DL, VT);
6760
6763 // Check for kernel and shader functions
6764 if (Info->isEntryFunction())
6765 return DAG.getConstant(0, DL, VT);
6766
6767 MachineFrameInfo &MFI = MF.getFrameInfo();
6768 // There is a call to @llvm.returnaddress in this function
6769 MFI.setReturnAddressIsTaken(true);
6770
6772 // Get the return address reg and mark it as an implicit live-in
6773 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6774 getRegClassFor(VT, Op.getNode()->isDivergent()));
6775
6776 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6777}
6778
6779SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6780 const SDLoc &DL, EVT VT) const {
6781 return Op.getValueType().bitsLE(VT)
6782 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6783 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6784 DAG.getTargetConstant(0, DL, MVT::i32));
6785}
6786
6787SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6788 assert(Op.getValueType() == MVT::f16 &&
6789 "Do not know how to custom lower FP_ROUND for non-f16 type");
6790
6791 SDValue Src = Op.getOperand(0);
6792 EVT SrcVT = Src.getValueType();
6793 if (SrcVT != MVT::f64)
6794 return Op;
6795
6796 // TODO: Handle strictfp
6797 if (Op.getOpcode() != ISD::FP_ROUND)
6798 return Op;
6799
6800 SDLoc DL(Op);
6801
6802 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6803 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6804 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6805}
6806
6807SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6808 SelectionDAG &DAG) const {
6809 EVT VT = Op.getValueType();
6810 const MachineFunction &MF = DAG.getMachineFunction();
6812 bool IsIEEEMode = Info->getMode().IEEE;
6813
6814 // FIXME: Assert during selection that this is only selected for
6815 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6816 // mode functions, but this happens to be OK since it's only done in cases
6817 // where it is known that there is no sNaN.
6818 if (IsIEEEMode)
6819 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6820
6821 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6822 VT == MVT::v16bf16)
6823 return splitBinaryVectorOp(Op, DAG);
6824 return Op;
6825}
6826
6827SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6828 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6829 EVT VT = Op.getValueType();
6830 assert(VT == MVT::f16);
6831
6832 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6833 EVT ExpVT = Exp.getValueType();
6834 if (ExpVT == MVT::i16)
6835 return Op;
6836
6837 SDLoc DL(Op);
6838
6839 // Correct the exponent type for f16 to i16.
6840 // Clamp the range of the exponent to the instruction's range.
6841
6842 // TODO: This should be a generic narrowing legalization, and can easily be
6843 // done for GlobalISel.
6844
6845 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6846 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6847
6848 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6849 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6850
6851 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6852
6853 if (IsStrict) {
6854 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6855 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6856 }
6857
6858 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6859}
6860
6862 switch (Op->getOpcode()) {
6863 case ISD::SRA:
6864 case ISD::SMIN:
6865 case ISD::SMAX:
6866 return ISD::SIGN_EXTEND;
6867 case ISD::SRL:
6868 case ISD::UMIN:
6869 case ISD::UMAX:
6870 return ISD::ZERO_EXTEND;
6871 case ISD::ADD:
6872 case ISD::SUB:
6873 case ISD::AND:
6874 case ISD::OR:
6875 case ISD::XOR:
6876 case ISD::SHL:
6877 case ISD::SELECT:
6878 case ISD::MUL:
6879 // The operation's result won't be influenced by garbage in the high bits.
6880 // TODO: Are all of those cases correct, and are there more?
6881 return ISD::ANY_EXTEND;
6882 case ISD::SETCC: {
6883 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6885 }
6886 default:
6887 llvm_unreachable("unexpected opcode!");
6888 }
6889}
6890
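// For example, a uniform (i16 (add x, y)) is rewritten below as
//   (i16 (trunc (i32 (add (i32 (any_extend x)), (i32 (any_extend y))))))
// using the extension kind chosen above, so the operation itself is done in
// 32 bits.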
6891SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6892 DAGCombinerInfo &DCI) const {
6893 const unsigned Opc = Op.getOpcode();
6894 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6895 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6896 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6897 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6898 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6899
6900 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6901 : Op->getOperand(0).getValueType();
6902 auto ExtTy = OpTy.changeElementType(MVT::i32);
6903
6904 if (DCI.isBeforeLegalizeOps() ||
6905 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6906 return SDValue();
6907
6908 auto &DAG = DCI.DAG;
6909
6910 SDLoc DL(Op);
6911 SDValue LHS;
6912 SDValue RHS;
6913 if (Opc == ISD::SELECT) {
6914 LHS = Op->getOperand(1);
6915 RHS = Op->getOperand(2);
6916 } else {
6917 LHS = Op->getOperand(0);
6918 RHS = Op->getOperand(1);
6919 }
6920
6921 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6922 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6923
6924 // Special case: for shifts, the RHS always needs a zext.
6925 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6926 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6927 else
6928 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6929
6930 // setcc always returns an i1 or i1 vector, so there is no need to truncate after.
6931 if (Opc == ISD::SETCC) {
6932 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6933 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6934 }
6935
6936 // For other ops, we extend the operation's return type as well so we need to
6937 // truncate back to the original type.
6938 SDValue NewVal;
6939 if (Opc == ISD::SELECT)
6940 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6941 else
6942 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6943
6944 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6945}
6946
6947// Custom lowering for vector multiplications and s_mul_u64.
6948SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6949 EVT VT = Op.getValueType();
6950
6951 // Split vector operands.
6952 if (VT.isVector())
6953 return splitBinaryVectorOp(Op, DAG);
6954
6955 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6956
6957 // There are four ways to lower s_mul_u64:
6958 //
6959 // 1. If all the operands are uniform, then we lower it as it is.
6960 //
6961 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6962 // multiplications because there is no vector equivalent of s_mul_u64.
6963 //
6964 // 3. If the cost model decides that it is more efficient to use vector
6965 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6966 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
6967 //
6968 // 4. If the cost model decides to use vector registers and both of the
6969 // operands are zero-extended/sign-extended from 32 bits, then we split the
6970 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6971 // possible to check if the operands are zero-extended or sign-extended in
6972 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6973 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6974 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6975 // If the cost model decides that we have to use vector registers, then
6976 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6977 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6978 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6979 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6980 // SIInstrInfo.cpp.
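//
// For example, (mul (zext i32 %a to i64), (zext i32 %b to i64)) has at least
// 32 known leading zero bits in each operand and is selected below as
// S_MUL_U64_U32_PSEUDO, while the sext form (at least 33 sign bits on both
// sides) becomes S_MUL_I64_I32_PSEUDO.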
6981
6982 if (Op->isDivergent())
6983 return SDValue();
6984
6985 SDValue Op0 = Op.getOperand(0);
6986 SDValue Op1 = Op.getOperand(1);
6987 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6988 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6989 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6990 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6991 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6992 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6993 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6994 SDLoc SL(Op);
6995 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6996 return SDValue(
6997 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6998 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6999 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7000 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7001 return SDValue(
7002 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7003 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7004 return Op;
7005}
7006
7007SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7008 EVT VT = Op.getValueType();
7009 SDLoc SL(Op);
7010 SDValue LHS = Op.getOperand(0);
7011 SDValue RHS = Op.getOperand(1);
7012 bool isSigned = Op.getOpcode() == ISD::SMULO;
7013
7014 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7015 const APInt &C = RHSC->getAPIntValue();
7016 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
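 // For example, umulo(X, 8) becomes Result = X << 3 and
 // Overflow = ((X << 3) >> 3) != X; the round trip only differs from X when
 // the shift dropped set bits, which is exactly the overflow condition.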
7017 if (C.isPowerOf2()) {
 7018 // smulo(x, signed_min) is the same as umulo(x, signed_min).
7019 bool UseArithShift = isSigned && !C.isMinSignedValue();
7020 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7021 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7022 SDValue Overflow =
7023 DAG.getSetCC(SL, MVT::i1,
7024 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7025 Result, ShiftAmt),
7026 LHS, ISD::SETNE);
7027 return DAG.getMergeValues({Result, Overflow}, SL);
7028 }
7029 }
7030
7031 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7032 SDValue Top =
7033 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7034
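 // For unsigned multiplies the product overflows iff the high half is nonzero;
 // for signed multiplies it overflows iff the high half differs from the sign
 // extension of the low half. The Top != Sign comparison below checks both cases.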
7035 SDValue Sign = isSigned
7036 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7037 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7038 SL, MVT::i32))
7039 : DAG.getConstant(0, SL, VT);
7040 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7041
7042 return DAG.getMergeValues({Result, Overflow}, SL);
7043}
7044
7045SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7046 if (Op->isDivergent()) {
7047 // Select to V_MAD_[IU]64_[IU]32.
7048 return Op;
7049 }
7050 if (Subtarget->hasSMulHi()) {
7051 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7052 return SDValue();
7053 }
7054 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7055 // calculate the high part, so we might as well do the whole thing with
7056 // V_MAD_[IU]64_[IU]32.
7057 return Op;
7058}
7059
7060SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7061 if (!Subtarget->isTrapHandlerEnabled() ||
7063 return lowerTrapEndpgm(Op, DAG);
7064
7065 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7066 : lowerTrapHsaQueuePtr(Op, DAG);
7067}
7068
7069SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7070 SDLoc SL(Op);
7071 SDValue Chain = Op.getOperand(0);
7072 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7073}
7074
7075SDValue
7076SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7077 const SDLoc &DL, Align Alignment,
7078 ImplicitParameter Param) const {
7081 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7083 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7086}
7087
7088SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7089 SelectionDAG &DAG) const {
7090 SDLoc SL(Op);
7091 SDValue Chain = Op.getOperand(0);
7092
7093 SDValue QueuePtr;
7094 // For code object version 5, QueuePtr is passed through implicit kernarg.
7095 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7097 QueuePtr =
7098 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7099 } else {
7102 Register UserSGPR = Info->getQueuePtrUserSGPR();
7103
7104 if (UserSGPR == AMDGPU::NoRegister) {
7105 // We probably are in a function incorrectly marked with
7106 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7107 // trap, so just use a null pointer.
7108 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7109 } else {
7110 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7111 MVT::i64);
7112 }
7113 }
7114
7115 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7116 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7117
7119 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7120 ToReg.getValue(1)};
7121 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7122}
7123
7124SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7125 SDLoc SL(Op);
7126 SDValue Chain = Op.getOperand(0);
7127
7128 // We need to simulate the 's_trap 2' instruction on targets that run in
7129 // PRIV=1 (where it is treated as a nop).
7130 if (Subtarget->hasPrivEnabledTrap2NopBug())
7131 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7132
7134 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7135 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7136}
7137
7138SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7139 SDLoc SL(Op);
7140 SDValue Chain = Op.getOperand(0);
7142
7143 if (!Subtarget->isTrapHandlerEnabled() ||
7146 "debugtrap handler not supported",
7147 Op.getDebugLoc(), DS_Warning);
7148 LLVMContext &Ctx = MF.getFunction().getContext();
7149 Ctx.diagnose(NoTrap);
7150 return Chain;
7151 }
7152
7153 uint64_t TrapID =
7155 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7156 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7157}
7158
7159SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7160 SelectionDAG &DAG) const {
7161 if (Subtarget->hasApertureRegs()) {
7162 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7163 ? AMDGPU::SRC_SHARED_BASE
7164 : AMDGPU::SRC_PRIVATE_BASE;
7165 // Note: this feature (register) is broken. When used as a 32-bit operand,
7166 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7167 // bits.
7168 //
 7169 // To work around the issue, directly emit a 64-bit mov from this register
7170 // then extract the high bits. Note that this shouldn't even result in a
7171 // shift being emitted and simply become a pair of registers (e.g.):
7172 // s_mov_b64 s[6:7], src_shared_base
7173 // v_mov_b32_e32 v1, s7
7174 //
7175 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7176 // coalescing would kick in and it would think it's okay to use the "HI"
7177 // subregister directly (instead of extracting the HI 32 bits) which is an
7178 // artificial (unusable) register.
7179 // Register TableGen definitions would need an overhaul to get rid of the
7180 // artificial "HI" aperture registers and prevent this kind of issue from
7181 // happening.
7182 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7183 DAG.getRegister(ApertureRegNo, MVT::i64));
7184 return DAG.getNode(
7185 ISD::TRUNCATE, DL, MVT::i32,
7186 DAG.getNode(ISD::SRL, DL, MVT::i64,
7187 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7188 }
7189
7190 // For code object version 5, private_base and shared_base are passed through
7191 // implicit kernargs.
7192 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7196 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7197 }
7198
7201 Register UserSGPR = Info->getQueuePtrUserSGPR();
7202 if (UserSGPR == AMDGPU::NoRegister) {
7203 // We probably are in a function incorrectly marked with
7204 // amdgpu-no-queue-ptr. This is undefined.
7205 return DAG.getUNDEF(MVT::i32);
7206 }
7207
7208 SDValue QueuePtr =
7209 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7210
7211 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7212 // private_segment_aperture_base_hi.
7213 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7214
7215 SDValue Ptr =
7216 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7217
7218 // TODO: Use custom target PseudoSourceValue.
7219 // TODO: We should use the value from the IR intrinsic call, but it might not
7220 // be available and how do we get it?
7222 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7223 commonAlignment(Align(64), StructOffset),
7226}
7227
7228/// Return true if the value is a known valid address, such that a null check is
7229/// not necessary.
7231 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7232 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7233 isa<BasicBlockSDNode>(Val))
7234 return true;
7235
7236 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7237 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7238
7239 // TODO: Search through arithmetic, handle arguments and loads
7240 // marked nonnull.
7241 return false;
7242}
7243
7244SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7245 SelectionDAG &DAG) const {
7246 SDLoc SL(Op);
7247
7248 const AMDGPUTargetMachine &TM =
7249 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7250
7251 unsigned DestAS, SrcAS;
7252 SDValue Src;
7253 bool IsNonNull = false;
7254 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7255 SrcAS = ASC->getSrcAddressSpace();
7256 Src = ASC->getOperand(0);
7257 DestAS = ASC->getDestAddressSpace();
7258 } else {
7259 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7260 Op.getConstantOperandVal(0) ==
7261 Intrinsic::amdgcn_addrspacecast_nonnull);
7262 Src = Op->getOperand(1);
7263 SrcAS = Op->getConstantOperandVal(2);
7264 DestAS = Op->getConstantOperandVal(3);
7265 IsNonNull = true;
7266 }
7267
7268 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7269
7270 // flat -> local/private
7271 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7272 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7273 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7274 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7275
7276 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7277 return Ptr;
7278
7279 unsigned NullVal = TM.getNullPointerValue(DestAS);
7280 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7281 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7282
7283 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7284 SegmentNullPtr);
7285 }
7286 }
7287
7288 // local/private -> flat
7289 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7290 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7291 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7292
7293 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7294 SDValue CvtPtr =
7295 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7296 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7297
7298 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7299 return CvtPtr;
7300
7301 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7302 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7303
7304 SDValue NonNull =
7305 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7306
7307 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7308 FlatNullPtr);
7309 }
7310 }
7311
7312 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7313 Op.getValueType() == MVT::i64) {
7316 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7317 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7318 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7319 }
7320
7321 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7322 Src.getValueType() == MVT::i64)
7323 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7324
7325 // global <-> flat are no-ops and never emitted.
7326
7327 const MachineFunction &MF = DAG.getMachineFunction();
7328 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7329 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7330 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7331
7332 return DAG.getUNDEF(Op->getValueType(0));
7333}
7334
7335// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7336// the small vector and inserting them into the big vector. That is better than
7337// the default expansion of doing it via a stack slot. Even though the use of
7338// the stack slot would be optimized away afterwards, the stack slot itself
7339// remains.
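 // For example, inserting a v2f16 subvector into a v8f16 vector at index 4
 // becomes a handful of EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT nodes (or a
 // single 32-bit element insert when the 16-bit fast path below applies),
 // rather than a store/load round trip through a stack slot.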
7340SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7341 SelectionDAG &DAG) const {
7342 SDValue Vec = Op.getOperand(0);
7343 SDValue Ins = Op.getOperand(1);
7344 SDValue Idx = Op.getOperand(2);
7345 EVT VecVT = Vec.getValueType();
7346 EVT InsVT = Ins.getValueType();
7347 EVT EltVT = VecVT.getVectorElementType();
7348 unsigned InsNumElts = InsVT.getVectorNumElements();
7349 unsigned IdxVal = Idx->getAsZExtVal();
7350 SDLoc SL(Op);
7351
7352 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7353 // Insert 32-bit registers at a time.
7354 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7355
7356 unsigned VecNumElts = VecVT.getVectorNumElements();
7357 EVT NewVecVT =
7358 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7359 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7361 MVT::i32, InsNumElts / 2);
7362
7363 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7364 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7365
7366 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7367 SDValue Elt;
7368 if (InsNumElts == 2) {
7369 Elt = Ins;
7370 } else {
7371 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7372 DAG.getConstant(I, SL, MVT::i32));
7373 }
7374 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7375 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7376 }
7377
7378 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7379 }
7380
7381 for (unsigned I = 0; I != InsNumElts; ++I) {
7382 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7383 DAG.getConstant(I, SL, MVT::i32));
7384 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7385 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7386 }
7387 return Vec;
7388}
7389
7390SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7391 SelectionDAG &DAG) const {
7392 SDValue Vec = Op.getOperand(0);
7393 SDValue InsVal = Op.getOperand(1);
7394 SDValue Idx = Op.getOperand(2);
7395 EVT VecVT = Vec.getValueType();
7396 EVT EltVT = VecVT.getVectorElementType();
7397 unsigned VecSize = VecVT.getSizeInBits();
7398 unsigned EltSize = EltVT.getSizeInBits();
7399 SDLoc SL(Op);
7400
7401 // Specially handle the case of v4i16 with static indexing.
7402 unsigned NumElts = VecVT.getVectorNumElements();
7403 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7404 if (NumElts == 4 && EltSize == 16 && KIdx) {
7405 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7406
7407 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7408 DAG.getConstant(0, SL, MVT::i32));
7409 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7410 DAG.getConstant(1, SL, MVT::i32));
7411
7412 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7413 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7414
7415 unsigned Idx = KIdx->getZExtValue();
7416 bool InsertLo = Idx < 2;
7417 SDValue InsHalf = DAG.getNode(
7418 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7419 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7420 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7421
7422 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7423
7424 SDValue Concat =
7425 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7426 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7427
7428 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7429 }
7430
7431 // Static indexing does not lower to stack access, and hence there is no need
7432 // for special custom lowering to avoid stack access.
7433 if (isa<ConstantSDNode>(Idx))
7434 return SDValue();
7435
7436 // Avoid stack access for dynamic indexing by custom lowering to
7437 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
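 // For example, inserting a 16-bit element into a v4i16 at dynamic index Idx:
 // the vector is treated as an i64, the bit index is Idx * 16, BFM produces
 // the mask 0xffff << (Idx * 16), the splatted value is ANDed with that mask,
 // the original vector is ANDed with the inverted mask, and the two results
 // are ORed back together (steps 1-4 below).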
7438
7439 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7440
7441 MVT IntVT = MVT::getIntegerVT(VecSize);
7442
7443 // Convert vector index to bit-index and get the required bit mask.
7444 assert(isPowerOf2_32(EltSize));
7445 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7446 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7447 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7448 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7449 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7450
7451 // 1. Create a congruent vector with the target value in each element.
7452 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7453 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7454
7455 // 2. Mask off all other indices except the required index within (1).
7456 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7457
7458 // 3. Mask off the required index within the target vector.
7459 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7460 SDValue RHS =
7461 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7462
7463 // 4. Get (2) and (3) ORed into the target vector.
7464 SDValue BFI =
7465 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7466
7467 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7468}
7469
7470SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7471 SelectionDAG &DAG) const {
7472 SDLoc SL(Op);
7473
7474 EVT ResultVT = Op.getValueType();
7475 SDValue Vec = Op.getOperand(0);
7476 SDValue Idx = Op.getOperand(1);
7477 EVT VecVT = Vec.getValueType();
7478 unsigned VecSize = VecVT.getSizeInBits();
7479 EVT EltVT = VecVT.getVectorElementType();
7480
7481 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7482
7483 // Make sure we do any optimizations that will make it easier to fold
7484 // source modifiers before obscuring it with bit operations.
7485
7486 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7487 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7488 return Combined;
7489
7490 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7491 SDValue Lo, Hi;
7492 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7493
7494 if (VecSize == 128) {
7495 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7496 Lo = DAG.getBitcast(LoVT,
7497 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7498 DAG.getConstant(0, SL, MVT::i32)));
7499 Hi = DAG.getBitcast(HiVT,
7500 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7501 DAG.getConstant(1, SL, MVT::i32)));
7502 } else if (VecSize == 256) {
7503 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7504 SDValue Parts[4];
7505 for (unsigned P = 0; P < 4; ++P) {
7506 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7507 DAG.getConstant(P, SL, MVT::i32));
7508 }
7509
7510 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7511 Parts[0], Parts[1]));
7512 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7513 Parts[2], Parts[3]));
7514 } else {
7515 assert(VecSize == 512);
7516
7517 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7518 SDValue Parts[8];
7519 for (unsigned P = 0; P < 8; ++P) {
7520 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7521 DAG.getConstant(P, SL, MVT::i32));
7522 }
7523
7524 Lo = DAG.getBitcast(LoVT,
7525 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7526 Parts[0], Parts[1], Parts[2], Parts[3]));
7527 Hi = DAG.getBitcast(HiVT,
7528 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7529 Parts[4], Parts[5], Parts[6], Parts[7]));
7530 }
7531
7532 EVT IdxVT = Idx.getValueType();
7533 unsigned NElem = VecVT.getVectorNumElements();
7534 assert(isPowerOf2_32(NElem));
7535 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7536 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7537 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7538 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7539 }
7540
7541 assert(VecSize <= 64);
7542
7543 MVT IntVT = MVT::getIntegerVT(VecSize);
7544
7545 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7546 SDValue VecBC = peekThroughBitcasts(Vec);
7547 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7548 SDValue Src = VecBC.getOperand(0);
7549 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7550 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7551 }
7552
7553 unsigned EltSize = EltVT.getSizeInBits();
7554 assert(isPowerOf2_32(EltSize));
7555
7556 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7557
7558 // Convert vector index to bit-index (* EltSize)
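 // E.g. extracting element 3 of a v4i16: the vector is viewed as an i64,
 // shifted right by 3 * 16 = 48 bits, and the low 16 bits of the shifted
 // value are the requested element.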
7559 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7560
7561 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7562 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7563
7564 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7565 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7566 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7567 }
7568
7569 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7570}
7571
7572static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7573 assert(Elt % 2 == 0);
7574 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7575}
7576
7577SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7578 SelectionDAG &DAG) const {
7579 SDLoc SL(Op);
7580 EVT ResultVT = Op.getValueType();
7581 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7582 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7583 MVT PackVT = MVT::getVectorVT(EltVT, 2);
7584 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7585
7586 // vector_shuffle <0,1,6,7> lhs, rhs
7587 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7588 //
7589 // vector_shuffle <6,7,2,3> lhs, rhs
7590 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7591 //
7592 // vector_shuffle <6,7,0,1> lhs, rhs
7593 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7594
7595 // Avoid scalarizing when both halves are reading from consecutive elements.
7597 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7598 if (elementPairIsContiguous(SVN->getMask(), I)) {
7599 const int Idx = SVN->getMaskElt(I);
7600 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7601 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7602 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7603 SVN->getOperand(VecIdx),
7604 DAG.getConstant(EltIdx, SL, MVT::i32));
7605 Pieces.push_back(SubVec);
7606 } else {
7607 const int Idx0 = SVN->getMaskElt(I);
7608 const int Idx1 = SVN->getMaskElt(I + 1);
7609 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7610 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7611 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7612 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7613
7614 SDValue Vec0 = SVN->getOperand(VecIdx0);
7615 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7616 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7617
7618 SDValue Vec1 = SVN->getOperand(VecIdx1);
7619 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7620 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7621 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7622 }
7623 }
7624
7625 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7626}
7627
7628SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7629 SelectionDAG &DAG) const {
7630 SDValue SVal = Op.getOperand(0);
7631 EVT ResultVT = Op.getValueType();
7632 EVT SValVT = SVal.getValueType();
7633 SDValue UndefVal = DAG.getUNDEF(SValVT);
7634 SDLoc SL(Op);
7635
7637 VElts.push_back(SVal);
7638 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7639 VElts.push_back(UndefVal);
7640
7641 return DAG.getBuildVector(ResultVT, SL, VElts);
7642}
7643
7644SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7645 SelectionDAG &DAG) const {
7646 SDLoc SL(Op);
7647 EVT VT = Op.getValueType();
7648
7649 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7650 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7651
7652 SDValue Lo = Op.getOperand(0);
7653 SDValue Hi = Op.getOperand(1);
7654
7655 // Avoid adding defined bits with the zero_extend.
7656 if (Hi.isUndef()) {
7657 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7658 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7659 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7660 }
7661
7662 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7663 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7664
7665 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7666 DAG.getConstant(16, SL, MVT::i32));
7667 if (Lo.isUndef())
7668 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7669
7670 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7671 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7672
7673 SDValue Or =
7674 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7675 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7676 }
7677
7678 // Split into 2-element chunks.
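 // E.g. (for v4f16) the BUILD_VECTOR becomes two v2f16 chunks, each bitcast to
 // i32, which are then combined into a v2i32 and bitcast back to v4f16.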
7679 const unsigned NumParts = VT.getVectorNumElements() / 2;
7681 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7682
7684 for (unsigned P = 0; P < NumParts; ++P) {
7685 SDValue Vec = DAG.getBuildVector(
7686 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7687 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7688 }
7689
7690 SDValue Blend =
7691 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7692 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7693}
7694
7696 const GlobalAddressSDNode *GA) const {
7697 // OSes that use ELF REL relocations (instead of RELA) can only store a
7698 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7699 // which can create arbitrary 64-bit addends. (This is only a problem for
7700 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7701 // the high 32 bits of the addend.)
7702 //
7703 // This should be kept in sync with how HasRelocationAddend is initialized in
7704 // the constructor of ELFAMDGPUAsmBackend.
7705 if (!Subtarget->isAmdHsaOS())
7706 return false;
7707
7708 // We can fold offsets for anything that doesn't require a GOT relocation.
7709 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7713}
7714
7715static SDValue
7717 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7718 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7719 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7720 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7721 // lowered to the following code sequence:
7722 //
7723 // For constant address space:
7724 // s_getpc_b64 s[0:1]
7725 // s_add_u32 s0, s0, $symbol
7726 // s_addc_u32 s1, s1, 0
7727 //
7728 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7729 // a fixup or relocation is emitted to replace $symbol with a literal
7730 // constant, which is a pc-relative offset from the encoding of the $symbol
7731 // operand to the global variable.
7732 //
7733 // For global address space:
7734 // s_getpc_b64 s[0:1]
7735 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7736 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7737 //
7738 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7739 // fixups or relocations are emitted to replace $symbol@*@lo and
7740 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7741 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7742 // operand to the global variable.
7743 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7744 SDValue PtrHi;
7745 if (GAFlags == SIInstrInfo::MO_NONE)
7746 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7747 else
7748 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7749 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7750}
7751
7752SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7753 SDValue Op,
7754 SelectionDAG &DAG) const {
7755 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7756 SDLoc DL(GSD);
7757 EVT PtrVT = Op.getValueType();
7758
7759 const GlobalValue *GV = GSD->getGlobal();
7765 GV->hasExternalLinkage()) {
7766 Type *Ty = GV->getValueType();
 7767 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
 7768 // zero-sized type in other languages to declare dynamic shared
 7769 // memory whose size is not known at compile time. Such arrays are
 7770 // allocated by the runtime and placed directly after the statically
 7771 // allocated ones, so they all share the same offset.
7772 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7773 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7774 // Adjust alignment for that dynamic shared memory array.
7776 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7777 MFI->setUsesDynamicLDS(true);
7778 return SDValue(
7779 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7780 }
7781 }
7783 }
7784
7786 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7788 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7789 }
7790
7791 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7792 SDValue AddrLo = DAG.getTargetGlobalAddress(
7793 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7794 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7795
7796 SDValue AddrHi = DAG.getTargetGlobalAddress(
7797 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7798 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7799
7800 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7801 }
7802
7803 if (shouldEmitFixup(GV))
7804 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7805
7806 if (shouldEmitPCReloc(GV))
7807 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7809
7810 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7812
7813 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7815 const DataLayout &DataLayout = DAG.getDataLayout();
7816 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7817 MachinePointerInfo PtrInfo =
7819
7820 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7823}
7824
7826 const SDLoc &DL, SDValue V) const {
7827 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7828 // the destination register.
7829 //
7830 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7831 // so we will end up with redundant moves to m0.
7832 //
7833 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7834
7835 // A Null SDValue creates a glue result.
7836 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7837 V, Chain);
7838 return SDValue(M0, 0);
7839}
7840
7841SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7842 MVT VT,
7843 unsigned Offset) const {
7844 SDLoc SL(Op);
7845 SDValue Param = lowerKernargMemParameter(
7846 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
 7847 // The local size values will have the high 16 bits as zero.
7848 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7849 DAG.getValueType(VT));
7850}
7851
7853 EVT VT) {
7855 "non-hsa intrinsic with hsa target",
7856 DL.getDebugLoc());
7857 DAG.getContext()->diagnose(BadIntrin);
7858 return DAG.getUNDEF(VT);
7859}
7860
7862 EVT VT) {
7864 "intrinsic not supported on subtarget",
7865 DL.getDebugLoc());
7866 DAG.getContext()->diagnose(BadIntrin);
7867 return DAG.getUNDEF(VT);
7868}
7869
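// getBuildDwordsVector below bitcasts each element to f32 and pads with undef
// up to a supported vector width (at most 16 elements), returning a single f32
// when there is only one element. It is used when assembling the image
// intrinsic vaddr operand.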
7871 ArrayRef<SDValue> Elts) {
7872 assert(!Elts.empty());
7873 MVT Type;
7874 unsigned NumElts = Elts.size();
7875
7876 if (NumElts <= 12) {
7877 Type = MVT::getVectorVT(MVT::f32, NumElts);
7878 } else {
7879 assert(Elts.size() <= 16);
7880 Type = MVT::v16f32;
7881 NumElts = 16;
7882 }
7883
7884 SmallVector<SDValue, 16> VecElts(NumElts);
7885 for (unsigned i = 0; i < Elts.size(); ++i) {
7886 SDValue Elt = Elts[i];
7887 if (Elt.getValueType() != MVT::f32)
7888 Elt = DAG.getBitcast(MVT::f32, Elt);
7889 VecElts[i] = Elt;
7890 }
7891 for (unsigned i = Elts.size(); i < NumElts; ++i)
7892 VecElts[i] = DAG.getUNDEF(MVT::f32);
7893
7894 if (NumElts == 1)
7895 return VecElts[0];
7896 return DAG.getBuildVector(Type, DL, VecElts);
7897}
7898
7899static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7900 SDValue Src, int ExtraElts) {
7901 EVT SrcVT = Src.getValueType();
7902
7904
7905 if (SrcVT.isVector())
7906 DAG.ExtractVectorElements(Src, Elts);
7907 else
7908 Elts.push_back(Src);
7909
7910 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7911 while (ExtraElts--)
7912 Elts.push_back(Undef);
7913
7914 return DAG.getBuildVector(CastVT, DL, Elts);
7915}
7916
 7917 // Reconstruct the required return value for an image load intrinsic.
 7918 // This is more complicated due to the optional use of TexFailCtrl, which means
 7919 // the required return type is an aggregate.
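// For example, a packed d16 load with a dmask of 0x7 returns three half values
// in two dwords (MaskPopDwords == 2); if TFE/LWE is enabled, one extra dword
// carries the texture-fail status and the result is merged back into
// {data, texfail, chain}.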
7921 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7922 bool Unpacked, bool IsD16, int DMaskPop,
7923 int NumVDataDwords, bool IsAtomicPacked16Bit,
7924 const SDLoc &DL) {
7925 // Determine the required return type. This is the same regardless of
 7926 // the IsTexFail flag.
7927 EVT ReqRetVT = ResultTypes[0];
7928 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7929 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7930 ? (ReqRetNumElts + 1) / 2
7931 : ReqRetNumElts;
7932
7933 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7934
7935 MVT DataDwordVT =
7936 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7937
7938 MVT MaskPopVT =
7939 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7940
7941 SDValue Data(Result, 0);
7942 SDValue TexFail;
7943
7944 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7945 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7946 if (MaskPopVT.isVector()) {
7947 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7948 SDValue(Result, 0), ZeroIdx);
7949 } else {
7950 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7951 SDValue(Result, 0), ZeroIdx);
7952 }
7953 }
7954
7955 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7956 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7957 NumDataDwords - MaskPopDwords);
7958
7959 if (IsD16)
7960 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7961
7962 EVT LegalReqRetVT = ReqRetVT;
7963 if (!ReqRetVT.isVector()) {
7964 if (!Data.getValueType().isInteger())
7965 Data = DAG.getNode(ISD::BITCAST, DL,
7966 Data.getValueType().changeTypeToInteger(), Data);
7967 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7968 } else {
7969 // We need to widen the return vector to a legal type
7970 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7971 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7972 LegalReqRetVT =
7974 ReqRetVT.getVectorNumElements() + 1);
7975 }
7976 }
7977 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7978
7979 if (IsTexFail) {
7980 TexFail =
7981 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7982 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7983
7984 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7985 }
7986
7987 if (Result->getNumValues() == 1)
7988 return Data;
7989
7990 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7991}
7992
7993static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7994 SDValue *LWE, bool &IsTexFail) {
7995 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7996
7997 uint64_t Value = TexFailCtrlConst->getZExtValue();
7998 if (Value) {
7999 IsTexFail = true;
8000 }
8001
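 // Bit 0 of the TexFailCtrl immediate enables TFE and bit 1 enables LWE; any
 // other set bit is invalid and makes this helper return false.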
8002 SDLoc DL(TexFailCtrlConst);
8003 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8004 Value &= ~(uint64_t)0x1;
8005 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8006 Value &= ~(uint64_t)0x2;
8007
8008 return Value == 0;
8009}
8010
8012 MVT PackVectorVT,
8013 SmallVectorImpl<SDValue> &PackedAddrs,
8014 unsigned DimIdx, unsigned EndIdx,
8015 unsigned NumGradients) {
8016 SDLoc DL(Op);
8017 for (unsigned I = DimIdx; I < EndIdx; I++) {
8018 SDValue Addr = Op.getOperand(I);
8019
8020 // Gradients are packed with undef for each coordinate.
8021 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8022 // 1D: undef,dx/dh; undef,dx/dv
8023 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8024 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8025 if (((I + 1) >= EndIdx) ||
8026 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8027 I == DimIdx + NumGradients - 1))) {
8028 if (Addr.getValueType() != MVT::i16)
8029 Addr = DAG.getBitcast(MVT::i16, Addr);
8030 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8031 } else {
8032 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8033 I++;
8034 }
8035 Addr = DAG.getBitcast(MVT::f32, Addr);
8036 PackedAddrs.push_back(Addr);
8037 }
8038}
8039
8040SDValue SITargetLowering::lowerImage(SDValue Op,
8042 SelectionDAG &DAG, bool WithChain) const {
8043 SDLoc DL(Op);
8045 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8046 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8048 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8049 unsigned IntrOpcode = Intr->BaseOpcode;
8050 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8051 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8052 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8053
8054 SmallVector<EVT, 3> ResultTypes(Op->values());
8055 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8056 bool IsD16 = false;
8057 bool IsG16 = false;
8058 bool IsA16 = false;
8059 SDValue VData;
8060 int NumVDataDwords = 0;
8061 bool AdjustRetType = false;
8062 bool IsAtomicPacked16Bit = false;
8063
8064 // Offset of intrinsic arguments
8065 const unsigned ArgOffset = WithChain ? 2 : 1;
8066
8067 unsigned DMask;
8068 unsigned DMaskLanes = 0;
8069
8070 if (BaseOpcode->Atomic) {
8071 VData = Op.getOperand(2);
8072
8073 IsAtomicPacked16Bit =
8074 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8075 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8076
8077 bool Is64Bit = VData.getValueSizeInBits() == 64;
8078 if (BaseOpcode->AtomicX2) {
8079 SDValue VData2 = Op.getOperand(3);
8080 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8081 {VData, VData2});
8082 if (Is64Bit)
8083 VData = DAG.getBitcast(MVT::v4i32, VData);
8084
8085 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8086 DMask = Is64Bit ? 0xf : 0x3;
8087 NumVDataDwords = Is64Bit ? 4 : 2;
8088 } else {
8089 DMask = Is64Bit ? 0x3 : 0x1;
8090 NumVDataDwords = Is64Bit ? 2 : 1;
8091 }
8092 } else {
8093 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8094 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8095
8096 if (BaseOpcode->Store) {
8097 VData = Op.getOperand(2);
8098
8099 MVT StoreVT = VData.getSimpleValueType();
8100 if (StoreVT.getScalarType() == MVT::f16) {
8101 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8102 return Op; // D16 is unsupported for this instruction
8103
8104 IsD16 = true;
8105 VData = handleD16VData(VData, DAG, true);
8106 }
8107
8108 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8109 } else if (!BaseOpcode->NoReturn) {
 8110 // Work out the number of dwords based on the dmask popcount, the underlying
 8111 // type, and whether packing is supported.
8112 MVT LoadVT = ResultTypes[0].getSimpleVT();
8113 if (LoadVT.getScalarType() == MVT::f16) {
8114 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8115 return Op; // D16 is unsupported for this instruction
8116
8117 IsD16 = true;
8118 }
8119
8120 // Confirm that the return type is large enough for the dmask specified
8121 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8122 (!LoadVT.isVector() && DMaskLanes > 1))
8123 return Op;
8124
 8125 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8126 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8127 // instructions.
8128 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8129 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8130 NumVDataDwords = (DMaskLanes + 1) / 2;
8131 else
8132 NumVDataDwords = DMaskLanes;
8133
8134 AdjustRetType = true;
8135 }
8136 }
8137
8138 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8140
8141 // Check for 16 bit addresses or derivatives and pack if true.
8142 MVT VAddrVT =
8143 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8144 MVT VAddrScalarVT = VAddrVT.getScalarType();
8145 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8146 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8147
8148 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8149 VAddrScalarVT = VAddrVT.getScalarType();
8150 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8151 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8152
8153 // Push back extra arguments.
8154 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8155 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8156 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8157 // Special handling of bias when A16 is on. Bias is of type half but
 8158 // occupies a full 32 bits.
8159 SDValue Bias = DAG.getBuildVector(
8160 MVT::v2f16, DL,
8161 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8162 VAddrs.push_back(Bias);
8163 } else {
8164 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8165 "Bias needs to be converted to 16 bit in A16 mode");
8166 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8167 }
8168 }
8169
8170 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
 8171 // 16-bit gradients are supported, but are tied to the A16 control,
 8172 // so both gradients and addresses must be 16-bit.
8173 LLVM_DEBUG(
8174 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8175 "require 16 bit args for both gradients and addresses");
8176 return Op;
8177 }
8178
8179 if (IsA16) {
8180 if (!ST->hasA16()) {
8181 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8182 "support 16 bit addresses\n");
8183 return Op;
8184 }
8185 }
8186
 8187 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
 8188 // is set then we have to compress/pack operands (either addresses,
 8189 // gradients, or both).
 8190 // In the case where A16 and gradients are tied (no G16 support), we have
 8191 // already verified that both IsA16 and IsG16 are true.
8192 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8193 // Activate g16
8194 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8196 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8197 }
8198
8199 // Add gradients (packed or unpacked)
8200 if (IsG16) {
8201 // Pack the gradients
8202 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8203 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8204 ArgOffset + Intr->GradientStart,
8205 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8206 } else {
8207 for (unsigned I = ArgOffset + Intr->GradientStart;
8208 I < ArgOffset + Intr->CoordStart; I++)
8209 VAddrs.push_back(Op.getOperand(I));
8210 }
8211
8212 // Add addresses (packed or unpacked)
8213 if (IsA16) {
8214 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8215 ArgOffset + Intr->CoordStart, VAddrEnd,
8216 0 /* No gradients */);
8217 } else {
8218 // Add uncompressed address
8219 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8220 VAddrs.push_back(Op.getOperand(I));
8221 }
8222
8223 // If the register allocator cannot place the address registers contiguously
8224 // without introducing moves, then using the non-sequential address encoding
8225 // is always preferable, since it saves VALU instructions and is usually a
8226 // wash in terms of code size or even better.
8227 //
8228 // However, we currently have no way of hinting to the register allocator that
8229 // MIMG addresses should be placed contiguously when it is possible to do so,
8230 // so force non-NSA for the common 2-address case as a heuristic.
8231 //
8232 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8233 // allocation when possible.
8234 //
8235 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8236 // set of the remaining addresses.
8237 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8238 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8239 const bool UseNSA = ST->hasNSAEncoding() &&
8240 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8241 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8242 const bool UsePartialNSA =
8243 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8244
8245 SDValue VAddr;
8246 if (UsePartialNSA) {
8247 VAddr = getBuildDwordsVector(DAG, DL,
8248 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8249 } else if (!UseNSA) {
8250 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8251 }
8252
8253 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8254 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8255 SDValue Unorm;
8256 if (!BaseOpcode->Sampler) {
8257 Unorm = True;
8258 } else {
8259 uint64_t UnormConst =
8260 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8261
8262 Unorm = UnormConst ? True : False;
8263 }
8264
8265 SDValue TFE;
8266 SDValue LWE;
8267 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8268 bool IsTexFail = false;
8269 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8270 return Op;
8271
8272 if (IsTexFail) {
8273 if (!DMaskLanes) {
 8274 // Expecting to get an error flag since TFC is on and dmask is 0.
 8275 // Force dmask to be at least 1, otherwise the instruction will fail.
8276 DMask = 0x1;
8277 DMaskLanes = 1;
8278 NumVDataDwords = 1;
8279 }
8280 NumVDataDwords += 1;
8281 AdjustRetType = true;
8282 }
8283
 8284 // Something earlier tagged that the return type needs adjusting.
 8285 // This happens if the instruction is a load or has set TexFailCtrl flags.
8286 if (AdjustRetType) {
8287 // NumVDataDwords reflects the true number of dwords required in the return
8288 // type
8289 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8290 // This is a no-op load. This can be eliminated
8291 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8292 if (isa<MemSDNode>(Op))
8293 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8294 return Undef;
8295 }
8296
8297 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8298 MVT::i32, NumVDataDwords)
8299 : MVT::i32;
8300
8301 ResultTypes[0] = NewVT;
8302 if (ResultTypes.size() == 3) {
 8303 // The original result was an aggregate type used for TexFailCtrl results.
 8304 // The actual instruction returns as a vector type, which has now been
 8305 // created. Remove the aggregate result.
8306 ResultTypes.erase(&ResultTypes[1]);
8307 }
8308 }
8309
8310 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8311 if (BaseOpcode->Atomic)
8312 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8313 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8315 return Op;
8316
8318 if (BaseOpcode->Store || BaseOpcode->Atomic)
8319 Ops.push_back(VData); // vdata
8320 if (UsePartialNSA) {
8321 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8322 Ops.push_back(VAddr);
8323 } else if (UseNSA)
8324 append_range(Ops, VAddrs);
8325 else
8326 Ops.push_back(VAddr);
8327 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8328 EVT RsrcVT = Rsrc.getValueType();
8329 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8330 return Op;
8331 Ops.push_back(Rsrc);
8332 if (BaseOpcode->Sampler) {
8333 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8334 if (Samp.getValueType() != MVT::v4i32)
8335 return Op;
8336 Ops.push_back(Samp);
8337 }
8338 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8339 if (IsGFX10Plus)
8340 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8341 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8342 Ops.push_back(Unorm);
8343 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8344 Ops.push_back(IsA16 && // r128, a16 for gfx9
8345 ST->hasFeature(AMDGPU::FeatureR128A16)
8346 ? True
8347 : False);
8348 if (IsGFX10Plus)
8349 Ops.push_back(IsA16 ? True : False);
8350 if (!Subtarget->hasGFX90AInsts()) {
8351 Ops.push_back(TFE); // tfe
8352 } else if (TFE->getAsZExtVal()) {
8353 report_fatal_error("TFE is not supported on this GPU");
8354 }
8355 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8356 Ops.push_back(LWE); // lwe
8357 if (!IsGFX10Plus)
8358 Ops.push_back(DimInfo->DA ? True : False);
8359 if (BaseOpcode->HasD16)
8360 Ops.push_back(IsD16 ? True : False);
8361 if (isa<MemSDNode>(Op))
8362 Ops.push_back(Op.getOperand(0)); // chain
8363
8364 int NumVAddrDwords =
8365 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8366 int Opcode = -1;
8367
8368 if (IsGFX12Plus) {
8369 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8370 NumVDataDwords, NumVAddrDwords);
8371 } else if (IsGFX11Plus) {
8372 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8373 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8374 : AMDGPU::MIMGEncGfx11Default,
8375 NumVDataDwords, NumVAddrDwords);
8376 } else if (IsGFX10Plus) {
8377 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8378 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8379 : AMDGPU::MIMGEncGfx10Default,
8380 NumVDataDwords, NumVAddrDwords);
8381 } else {
8382 if (Subtarget->hasGFX90AInsts()) {
8383 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8384 NumVDataDwords, NumVAddrDwords);
8385 if (Opcode == -1)
8387 "requested image instruction is not supported on this GPU");
8388 }
8389 if (Opcode == -1 &&
8391 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8392 NumVDataDwords, NumVAddrDwords);
8393 if (Opcode == -1)
8394 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8395 NumVDataDwords, NumVAddrDwords);
8396 }
8397 if (Opcode == -1)
8398 return Op;
8399
8400 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8401 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8402 MachineMemOperand *MemRef = MemOp->getMemOperand();
8403 DAG.setNodeMemRefs(NewNode, {MemRef});
8404 }
8405
8406 if (BaseOpcode->AtomicX2) {
8408 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8409 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8410 }
8411 if (BaseOpcode->NoReturn)
8412 return SDValue(NewNode, 0);
8413 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8414 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8415 NumVDataDwords, IsAtomicPacked16Bit, DL);
8416}
8417
8418SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8419 SDValue Offset, SDValue CachePolicy,
8420 SelectionDAG &DAG) const {
8422
8423 const DataLayout &DataLayout = DAG.getDataLayout();
8424 Align Alignment =
8426
8431 VT.getStoreSize(), Alignment);
8432
8433 if (!Offset->isDivergent()) {
8434 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8435
8436 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8437 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
 8438 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
8439 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8440 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8441 SDValue BufferLoad =
8443 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8444 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8445 }
8446
8447 // Widen vec3 load to vec4.
8448 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8449 !Subtarget->hasScalarDwordx3Loads()) {
8450 EVT WidenedVT =
8452 auto WidenedOp = DAG.getMemIntrinsicNode(
8453 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8454 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8455 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8456 DAG.getVectorIdxConstant(0, DL));
8457 return Subvector;
8458 }
8459
8461 DAG.getVTList(VT), Ops, VT, MMO);
8462 }
8463
8464 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8465 // assume that the buffer is unswizzled.
8466 SDValue Ops[] = {
8467 DAG.getEntryNode(), // Chain
8468 Rsrc, // rsrc
8469 DAG.getConstant(0, DL, MVT::i32), // vindex
8470 {}, // voffset
8471 {}, // soffset
8472 {}, // offset
8473 CachePolicy, // cachepolicy
8474 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8475 };
8476 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8477 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8478 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8479 }
8480
8482 unsigned NumLoads = 1;
8483 MVT LoadVT = VT.getSimpleVT();
8484 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8485 assert((LoadVT.getScalarType() == MVT::i32 ||
8486 LoadVT.getScalarType() == MVT::f32));
8487
8488 if (NumElts == 8 || NumElts == 16) {
8489 NumLoads = NumElts / 4;
8490 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8491 }
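 // E.g. a v8f32 s.buffer.load with a divergent offset is emitted as two
 // 16-byte buffer loads at offsets +0 and +16 and then concatenated back
 // into a v8f32.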
8492
8493 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8494
8495 // Use the alignment to ensure that the required offsets will fit into the
8496 // immediate offsets.
8497 setBufferOffsets(Offset, DAG, &Ops[3],
8498 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8499
8500 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8501 for (unsigned i = 0; i < NumLoads; ++i) {
8502 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8503 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8504 LoadVT, MMO, DAG));
8505 }
8506
8507 if (NumElts == 8 || NumElts == 16)
8508 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8509
8510 return Loads[0];
8511}
8512
8513SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8514 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
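 // That is, wave ID = (TTMP8 >> 25) & 0x1f, implemented below as a BFE_U32
 // with offset 25 and width 5.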
8515 if (!Subtarget->hasArchitectedSGPRs())
8516 return {};
8517 SDLoc SL(Op);
8518 MVT VT = MVT::i32;
8519 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8520 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8521 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8522}
8523
8524SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8525 unsigned Dim,
8526 const ArgDescriptor &Arg) const {
8527 SDLoc SL(Op);
8529 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8530 if (MaxID == 0)
8531 return DAG.getConstant(0, SL, MVT::i32);
8532
8533 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8534 SDLoc(DAG.getEntryNode()), Arg);
8535
8536 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8537 // masking operations anyway.
8538 //
8539 // TODO: We could assert the top bit is 0 for the source copy.
8540 if (Arg.isMasked())
8541 return Val;
8542
8543 // Preserve the known bits after expansion to a copy.
8545 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8546 DAG.getValueType(SmallVT));
8547}
8548
8549SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8550 SelectionDAG &DAG) const {
8552 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8553
8554 EVT VT = Op.getValueType();
8555 SDLoc DL(Op);
8556 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8557
8558 // TODO: Should this propagate fast-math-flags?
8559
8560 switch (IntrinsicID) {
8561 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8562 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8563 return emitNonHSAIntrinsicError(DAG, DL, VT);
8564 return getPreloadedValue(DAG, *MFI, VT,
8566 }
8567 case Intrinsic::amdgcn_dispatch_ptr:
8568 case Intrinsic::amdgcn_queue_ptr: {
8569 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8570 DiagnosticInfoUnsupported BadIntrin(
8571 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8572 DL.getDebugLoc());
8573 DAG.getContext()->diagnose(BadIntrin);
8574 return DAG.getUNDEF(VT);
8575 }
8576
8577 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8580 return getPreloadedValue(DAG, *MFI, VT, RegID);
8581 }
8582 case Intrinsic::amdgcn_implicitarg_ptr: {
8583 if (MFI->isEntryFunction())
8584 return getImplicitArgPtr(DAG, DL);
8585 return getPreloadedValue(DAG, *MFI, VT,
8587 }
8588 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8590 // This only makes sense to call in a kernel, so just lower to null.
8591 return DAG.getConstant(0, DL, VT);
8592 }
8593
8594 return getPreloadedValue(DAG, *MFI, VT,
8596 }
8597 case Intrinsic::amdgcn_dispatch_id: {
8598 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8599 }
8600 case Intrinsic::amdgcn_rcp:
8601 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8602 case Intrinsic::amdgcn_rsq:
8603 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8604 case Intrinsic::amdgcn_rsq_legacy:
8605 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8606 return emitRemovedIntrinsicError(DAG, DL, VT);
8607 return SDValue();
8608 case Intrinsic::amdgcn_rcp_legacy:
8609 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8610 return emitRemovedIntrinsicError(DAG, DL, VT);
8611 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8612 case Intrinsic::amdgcn_rsq_clamp: {
8613 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8614 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8615
8616 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8617 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8618 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8619
8620 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8621 SDValue Tmp =
8622 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8623 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8624 DAG.getConstantFP(Min, DL, VT));
8625 }
8626 case Intrinsic::r600_read_ngroups_x:
8627 if (Subtarget->isAmdHsaOS())
8628 return emitNonHSAIntrinsicError(DAG, DL, VT);
8629
8630 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8631 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8632 false);
8633 case Intrinsic::r600_read_ngroups_y:
8634 if (Subtarget->isAmdHsaOS())
8635 return emitNonHSAIntrinsicError(DAG, DL, VT);
8636
8637 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8638 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8639 false);
8640 case Intrinsic::r600_read_ngroups_z:
8641 if (Subtarget->isAmdHsaOS())
8642 return emitNonHSAIntrinsicError(DAG, DL, VT);
8643
8644 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8645 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8646 false);
8647 case Intrinsic::r600_read_global_size_x:
8648 if (Subtarget->isAmdHsaOS())
8649 return emitNonHSAIntrinsicError(DAG, DL, VT);
8650
8651 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8652 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8653 Align(4), false);
8654 case Intrinsic::r600_read_global_size_y:
8655 if (Subtarget->isAmdHsaOS())
8656 return emitNonHSAIntrinsicError(DAG, DL, VT);
8657
8658 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8659 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8660 Align(4), false);
8661 case Intrinsic::r600_read_global_size_z:
8662 if (Subtarget->isAmdHsaOS())
8663 return emitNonHSAIntrinsicError(DAG, DL, VT);
8664
8665 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8666 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8667 Align(4), false);
8668 case Intrinsic::r600_read_local_size_x:
8669 if (Subtarget->isAmdHsaOS())
8670 return emitNonHSAIntrinsicError(DAG, DL, VT);
8671
8672 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8673 SI::KernelInputOffsets::LOCAL_SIZE_X);
8674 case Intrinsic::r600_read_local_size_y:
8675 if (Subtarget->isAmdHsaOS())
8676 return emitNonHSAIntrinsicError(DAG, DL, VT);
8677
8678 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8679 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8680 case Intrinsic::r600_read_local_size_z:
8681 if (Subtarget->isAmdHsaOS())
8682 return emitNonHSAIntrinsicError(DAG, DL, VT);
8683
8684 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8685 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8686 case Intrinsic::amdgcn_workgroup_id_x:
8687 return getPreloadedValue(DAG, *MFI, VT,
8688 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8689
8690 case Intrinsic::amdgcn_workgroup_id_y:
8690 return getPreloadedValue(DAG, *MFI, VT,
8691 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8692 case Intrinsic::amdgcn_workgroup_id_z:
8693 return getPreloadedValue(DAG, *MFI, VT,
8694 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8695 case Intrinsic::amdgcn_wave_id:
8696 return lowerWaveID(DAG, Op);
8697 case Intrinsic::amdgcn_lds_kernel_id: {
8698 if (MFI->isEntryFunction())
8699 return getLDSKernelId(DAG, DL);
8700 return getPreloadedValue(DAG, *MFI, VT,
8701 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8702 }
8703 case Intrinsic::amdgcn_workitem_id_x:
8704 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8705 case Intrinsic::amdgcn_workitem_id_y:
8706 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8707 case Intrinsic::amdgcn_workitem_id_z:
8708 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8709 case Intrinsic::amdgcn_wavefrontsize:
8710 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8711 SDLoc(Op), MVT::i32);
8712 case Intrinsic::amdgcn_s_buffer_load: {
8713 unsigned CPol = Op.getConstantOperandVal(3);
8714 // s_buffer_load, because of how it's optimized, can't be volatile
8715 // so reject ones with the volatile bit set.
8716 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8717 ? AMDGPU::CPol::ALL
8718 : AMDGPU::CPol::ALL_pregfx12))
8719 return Op;
8720 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8721 Op.getOperand(3), DAG);
8722 }
8723 case Intrinsic::amdgcn_fdiv_fast:
8724 return lowerFDIV_FAST(Op, DAG);
8725 case Intrinsic::amdgcn_sin:
8726 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8727
8728 case Intrinsic::amdgcn_cos:
8729 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8730
8731 case Intrinsic::amdgcn_mul_u24:
8732 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8733 Op.getOperand(2));
8734 case Intrinsic::amdgcn_mul_i24:
8735 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8736 Op.getOperand(2));
8737
8738 case Intrinsic::amdgcn_log_clamp: {
8739 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8740 return SDValue();
8741
8742 return emitRemovedIntrinsicError(DAG, DL, VT);
8743 }
8744 case Intrinsic::amdgcn_fract:
8745 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8746
8747 case Intrinsic::amdgcn_class:
8748 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8749 Op.getOperand(2));
8750 case Intrinsic::amdgcn_div_fmas:
8751 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8752 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8753
8754 case Intrinsic::amdgcn_div_fixup:
8755 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8756 Op.getOperand(2), Op.getOperand(3));
8757
8758 case Intrinsic::amdgcn_div_scale: {
8759 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8760
8761 // Translate to the operands expected by the machine instruction. The
8762 // first parameter must be the same as the first instruction.
8763 SDValue Numerator = Op.getOperand(1);
8764 SDValue Denominator = Op.getOperand(2);
8765
8766 // Note this order is opposite of the machine instruction's operations,
8767 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8768 // intrinsic has the numerator as the first operand to match a normal
8769 // division operation.
8770
8771 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8772
8773 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8774 Denominator, Numerator);
8775 }
8776 case Intrinsic::amdgcn_icmp: {
8777 // There is a Pat that handles this variant, so return it as-is.
8778 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8779 Op.getConstantOperandVal(2) == 0 &&
8780 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8781 return Op;
8782 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8783 }
8784 case Intrinsic::amdgcn_fcmp: {
8785 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8786 }
8787 case Intrinsic::amdgcn_ballot:
8788 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8789 case Intrinsic::amdgcn_fmed3:
8790 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8791 Op.getOperand(2), Op.getOperand(3));
8792 case Intrinsic::amdgcn_fdot2:
8793 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8794 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8795 case Intrinsic::amdgcn_fmul_legacy:
8796 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8797 Op.getOperand(2));
8798 case Intrinsic::amdgcn_sffbh:
8799 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8800 case Intrinsic::amdgcn_sbfe:
8801 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8802 Op.getOperand(2), Op.getOperand(3));
8803 case Intrinsic::amdgcn_ubfe:
8804 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8805 Op.getOperand(2), Op.getOperand(3));
8806 case Intrinsic::amdgcn_cvt_pkrtz:
8807 case Intrinsic::amdgcn_cvt_pknorm_i16:
8808 case Intrinsic::amdgcn_cvt_pknorm_u16:
8809 case Intrinsic::amdgcn_cvt_pk_i16:
8810 case Intrinsic::amdgcn_cvt_pk_u16: {
8811 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8812 EVT VT = Op.getValueType();
8813 unsigned Opcode;
8814
8815 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8816 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8817 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8818 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8819 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8820 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8821 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8822 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8823 else
8824 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8825
8826 if (isTypeLegal(VT))
8827 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8828
8829 SDValue Node =
8830 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8831 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8832 }
8833 case Intrinsic::amdgcn_fmad_ftz:
8834 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8835 Op.getOperand(2), Op.getOperand(3));
8836
8837 case Intrinsic::amdgcn_if_break:
8838 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8839 Op->getOperand(1), Op->getOperand(2)),
8840 0);
8841
8842 case Intrinsic::amdgcn_groupstaticsize: {
8843 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8844 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8845 return Op;
8846
8847 const Module *M = MF.getFunction().getParent();
8848 const GlobalValue *GV =
8849 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8850 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8851 SIInstrInfo::MO_ABS32_LO);
8852 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8853 }
8854 case Intrinsic::amdgcn_is_shared:
8855 case Intrinsic::amdgcn_is_private: {
8856 SDLoc SL(Op);
8857 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8858 ? AMDGPUAS::LOCAL_ADDRESS
8859 : AMDGPUAS::PRIVATE_ADDRESS;
8860 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
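// A flat pointer lies in the selected segment exactly when its high 32 bits
// equal that segment's aperture base, which is what the extract + setcc below
// check.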
8861 SDValue SrcVec =
8862 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8863
8864 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8865 DAG.getConstant(1, SL, MVT::i32));
8866 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8867 }
8868 case Intrinsic::amdgcn_perm:
8869 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8870 Op.getOperand(2), Op.getOperand(3));
8871 case Intrinsic::amdgcn_reloc_constant: {
8872 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8873 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8874 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8875 auto *RelocSymbol = cast<GlobalVariable>(
8876 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8877 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8878 SIInstrInfo::MO_ABS32_LO);
8879 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8880 }
8881 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8882 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8883 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8884 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8885 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8886 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8887 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8888 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8889 if (Op.getOperand(4).getValueType() == MVT::i32)
8890 return SDValue();
8891
8892 SDLoc SL(Op);
8893 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8894 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8895 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8896 Op.getOperand(3), IndexKeyi32);
8897 }
8898 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8899 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8900 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8901 if (Op.getOperand(6).getValueType() == MVT::i32)
8902 return SDValue();
8903
8904 SDLoc SL(Op);
8905 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8906 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8907 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8908 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8909 IndexKeyi32, Op.getOperand(7)});
8910 }
8911 case Intrinsic::amdgcn_addrspacecast_nonnull:
8912 return lowerADDRSPACECAST(Op, DAG);
8913 case Intrinsic::amdgcn_readlane:
8914 case Intrinsic::amdgcn_readfirstlane:
8915 case Intrinsic::amdgcn_writelane:
8916 case Intrinsic::amdgcn_permlane16:
8917 case Intrinsic::amdgcn_permlanex16:
8918 case Intrinsic::amdgcn_permlane64:
8919 case Intrinsic::amdgcn_set_inactive:
8920 case Intrinsic::amdgcn_set_inactive_chain_arg:
8921 case Intrinsic::amdgcn_mov_dpp8:
8922 case Intrinsic::amdgcn_update_dpp:
8923 return lowerLaneOp(*this, Op.getNode(), DAG);
8924 default:
8925 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8926 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8927 return lowerImage(Op, ImageDimIntr, DAG, false);
8928
8929 return Op;
8930 }
8931}
8932
8933// On targets not supporting constant in soffset field, turn zero to
8934// SGPR_NULL to avoid generating an extra s_mov with zero.
8935 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8936 const GCNSubtarget *Subtarget) {
8937 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8938 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8939 return SOffset;
8940}
8941
8942SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8943 SelectionDAG &DAG,
8944 unsigned NewOpcode) const {
8945 SDLoc DL(Op);
8946
8947 SDValue VData = Op.getOperand(2);
8948 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8949 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8950 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8951 SDValue Ops[] = {
8952 Op.getOperand(0), // Chain
8953 VData, // vdata
8954 Rsrc, // rsrc
8955 DAG.getConstant(0, DL, MVT::i32), // vindex
8956 VOffset, // voffset
8957 SOffset, // soffset
8958 Offset, // offset
8959 Op.getOperand(6), // cachepolicy
8960 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8961 };
8962
8963 auto *M = cast<MemSDNode>(Op);
8964
8965 EVT MemVT = VData.getValueType();
8966 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8967 M->getMemOperand());
8968}
8969
8970SDValue
8971SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8972 unsigned NewOpcode) const {
8973 SDLoc DL(Op);
8974
8975 SDValue VData = Op.getOperand(2);
8976 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8977 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8978 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8979 SDValue Ops[] = {
8980 Op.getOperand(0), // Chain
8981 VData, // vdata
8982 Rsrc, // rsrc
8983 Op.getOperand(4), // vindex
8984 VOffset, // voffset
8985 SOffset, // soffset
8986 Offset, // offset
8987 Op.getOperand(7), // cachepolicy
8988 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8989 };
8990
8991 auto *M = cast<MemSDNode>(Op);
8992
8993 EVT MemVT = VData.getValueType();
8994 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8995 M->getMemOperand());
8996}
8997
8998SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8999 SelectionDAG &DAG) const {
9000 unsigned IntrID = Op.getConstantOperandVal(1);
9001 SDLoc DL(Op);
9002
9003 switch (IntrID) {
9004 case Intrinsic::amdgcn_ds_ordered_add:
9005 case Intrinsic::amdgcn_ds_ordered_swap: {
9006 MemSDNode *M = cast<MemSDNode>(Op);
9007 SDValue Chain = M->getOperand(0);
9008 SDValue M0 = M->getOperand(2);
9009 SDValue Value = M->getOperand(3);
9010 unsigned IndexOperand = M->getConstantOperandVal(7);
9011 unsigned WaveRelease = M->getConstantOperandVal(8);
9012 unsigned WaveDone = M->getConstantOperandVal(9);
9013
9014 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9015 IndexOperand &= ~0x3f;
9016 unsigned CountDw = 0;
9017
9018 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9019 CountDw = (IndexOperand >> 24) & 0xf;
9020 IndexOperand &= ~(0xf << 24);
9021
9022 if (CountDw < 1 || CountDw > 4) {
9024 "ds_ordered_count: dword count must be between 1 and 4");
9025 }
9026 }
9027
9028 if (IndexOperand)
9029 report_fatal_error("ds_ordered_count: bad index operand");
9030
9031 if (WaveDone && !WaveRelease)
9032 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9033
9034 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9035 unsigned ShaderType =
9036 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9037 unsigned Offset0 = OrderedCountIndex << 2;
9038 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9039
9040 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9041 Offset1 |= (CountDw - 1) << 6;
9042
9043 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9044 Offset1 |= ShaderType << 2;
9045
9046 unsigned Offset = Offset0 | (Offset1 << 8);
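// As assembled above, the packed 16-bit offset holds: the ordered-count index
// in bits [7:2], wave_release in bit 8, wave_done in bit 9, the shader type in
// bits [11:10] (pre-GFX11 only), the add/swap selector in bit 12, and the
// dword count minus one in bits [15:14] (GFX10+).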
9047
9048 SDValue Ops[] = {
9049 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9050 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9051 };
9052 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9053 M->getVTList(), Ops, M->getMemoryVT(),
9054 M->getMemOperand());
9055 }
9056 case Intrinsic::amdgcn_raw_buffer_load:
9057 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9058 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9059 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9060 case Intrinsic::amdgcn_raw_buffer_load_format:
9061 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9062 const bool IsFormat =
9063 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9064 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9065
9066 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9067 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9068 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9069 SDValue Ops[] = {
9070 Op.getOperand(0), // Chain
9071 Rsrc, // rsrc
9072 DAG.getConstant(0, DL, MVT::i32), // vindex
9073 VOffset, // voffset
9074 SOffset, // soffset
9075 Offset, // offset
9076 Op.getOperand(5), // cachepolicy, swizzled buffer
9077 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9078 };
9079
9080 auto *M = cast<MemSDNode>(Op);
9081 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9082 }
9083 case Intrinsic::amdgcn_struct_buffer_load:
9084 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9085 case Intrinsic::amdgcn_struct_buffer_load_format:
9086 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9087 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9088 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9089 const bool IsFormat =
9090 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9091 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9092
9093 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9094 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9095 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9096 SDValue Ops[] = {
9097 Op.getOperand(0), // Chain
9098 Rsrc, // rsrc
9099 Op.getOperand(3), // vindex
9100 VOffset, // voffset
9101 SOffset, // soffset
9102 Offset, // offset
9103 Op.getOperand(6), // cachepolicy, swizzled buffer
9104 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9105 };
9106
9107 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9108 }
9109 case Intrinsic::amdgcn_raw_tbuffer_load:
9110 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9111 MemSDNode *M = cast<MemSDNode>(Op);
9112 EVT LoadVT = Op.getValueType();
9113 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9114 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9115 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9116
9117 SDValue Ops[] = {
9118 Op.getOperand(0), // Chain
9119 Rsrc, // rsrc
9120 DAG.getConstant(0, DL, MVT::i32), // vindex
9121 VOffset, // voffset
9122 SOffset, // soffset
9123 Offset, // offset
9124 Op.getOperand(5), // format
9125 Op.getOperand(6), // cachepolicy, swizzled buffer
9126 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9127 };
9128
9129 if (LoadVT.getScalarType() == MVT::f16)
9130 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9131 Ops);
9132 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9133 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9134 DAG);
9135 }
9136 case Intrinsic::amdgcn_struct_tbuffer_load:
9137 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9138 MemSDNode *M = cast<MemSDNode>(Op);
9139 EVT LoadVT = Op.getValueType();
9140 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9141 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9142 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9143
9144 SDValue Ops[] = {
9145 Op.getOperand(0), // Chain
9146 Rsrc, // rsrc
9147 Op.getOperand(3), // vindex
9148 VOffset, // voffset
9149 SOffset, // soffset
9150 Offset, // offset
9151 Op.getOperand(6), // format
9152 Op.getOperand(7), // cachepolicy, swizzled buffer
9153 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9154 };
9155
9156 if (LoadVT.getScalarType() == MVT::f16)
9157 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9158 Ops);
9159 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9160 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9161 DAG);
9162 }
9163 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9165 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9166 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9167 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9168 return lowerStructBufferAtomicIntrin(Op, DAG,
9169 AMDGPUISD::BUFFER_ATOMIC_FADD);
9170 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9171 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9172 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9173 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9174 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9175 return lowerStructBufferAtomicIntrin(Op, DAG,
9176 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9177 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9179 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9180 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9182 return lowerStructBufferAtomicIntrin(Op, DAG,
9183 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9184 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9186 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9187 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9189 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9190 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9192 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9193 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9195 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9196 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9198 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9199 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9201 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9202 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9203 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9204 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9205 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9207 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9208 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9210 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9211 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9213 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9214 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9215 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9216 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9217 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9218 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9219 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9220 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9221 return lowerRawBufferAtomicIntrin(Op, DAG,
9222 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9223 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9224 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9225 return lowerStructBufferAtomicIntrin(Op, DAG,
9226 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9227 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9229 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9230 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9231 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9232 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9233 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9234 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9235 return lowerStructBufferAtomicIntrin(Op, DAG,
9236 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9237 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9239 return lowerStructBufferAtomicIntrin(Op, DAG,
9240 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9241 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9243 return lowerStructBufferAtomicIntrin(Op, DAG,
9244 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9245 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9247 return lowerStructBufferAtomicIntrin(Op, DAG,
9248 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9249 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9251 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9252 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9254 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9255 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9257 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9258 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9260 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9261 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9262 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9263 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9264 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9265 return lowerStructBufferAtomicIntrin(Op, DAG,
9266 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9267
9268 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9269 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9270 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9271 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9272 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9273 SDValue Ops[] = {
9274 Op.getOperand(0), // Chain
9275 Op.getOperand(2), // src
9276 Op.getOperand(3), // cmp
9277 Rsrc, // rsrc
9278 DAG.getConstant(0, DL, MVT::i32), // vindex
9279 VOffset, // voffset
9280 SOffset, // soffset
9281 Offset, // offset
9282 Op.getOperand(7), // cachepolicy
9283 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9284 };
9285 EVT VT = Op.getValueType();
9286 auto *M = cast<MemSDNode>(Op);
9287
9288 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9289 Op->getVTList(), Ops, VT,
9290 M->getMemOperand());
9291 }
9292 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9294 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9295 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9296 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9297 SDValue Ops[] = {
9298 Op.getOperand(0), // Chain
9299 Op.getOperand(2), // src
9300 Op.getOperand(3), // cmp
9301 Rsrc, // rsrc
9302 Op.getOperand(5), // vindex
9303 VOffset, // voffset
9304 SOffset, // soffset
9305 Offset, // offset
9306 Op.getOperand(8), // cachepolicy
9307 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9308 };
9309 EVT VT = Op.getValueType();
9310 auto *M = cast<MemSDNode>(Op);
9311
9312 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9313 Op->getVTList(), Ops, VT,
9314 M->getMemOperand());
9315 }
9316 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9317 MemSDNode *M = cast<MemSDNode>(Op);
9318 SDValue NodePtr = M->getOperand(2);
9319 SDValue RayExtent = M->getOperand(3);
9320 SDValue RayOrigin = M->getOperand(4);
9321 SDValue RayDir = M->getOperand(5);
9322 SDValue RayInvDir = M->getOperand(6);
9323 SDValue TDescr = M->getOperand(7);
9324
9325 assert(NodePtr.getValueType() == MVT::i32 ||
9326 NodePtr.getValueType() == MVT::i64);
9327 assert(RayDir.getValueType() == MVT::v3f16 ||
9328 RayDir.getValueType() == MVT::v3f32);
9329
9330 if (!Subtarget->hasGFX10_AEncoding()) {
9331 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9332 return SDValue();
9333 }
9334
9335 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9336 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9337 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9338 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9339 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9340 const unsigned NumVDataDwords = 4;
9341 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9342 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9343 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9344 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9345 IsGFX12Plus;
9346 const unsigned BaseOpcodes[2][2] = {
9347 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9348 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9349 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9350 int Opcode;
9351 if (UseNSA) {
9352 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9353 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9354 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9355 : AMDGPU::MIMGEncGfx10NSA,
9356 NumVDataDwords, NumVAddrDwords);
9357 } else {
9358 assert(!IsGFX12Plus);
9359 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9360 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9361 : AMDGPU::MIMGEncGfx10Default,
9362 NumVDataDwords, NumVAddrDwords);
9363 }
9364 assert(Opcode != -1);
9365
9366 SmallVector<SDValue, 16> Ops;
9367
9368 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9369 SmallVector<SDValue, 3> Lanes;
9370 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9371 if (Lanes[0].getValueSizeInBits() == 32) {
9372 for (unsigned I = 0; I < 3; ++I)
9373 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9374 } else {
9375 if (IsAligned) {
9376 Ops.push_back(DAG.getBitcast(
9377 MVT::i32,
9378 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9379 Ops.push_back(Lanes[2]);
9380 } else {
9381 SDValue Elt0 = Ops.pop_back_val();
9382 Ops.push_back(DAG.getBitcast(
9383 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9384 Ops.push_back(DAG.getBitcast(
9385 MVT::i32,
9386 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9387 }
9388 }
9389 };
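// Note on packLanes (defined above): 32-bit lanes are pushed as three separate
// dwords; f16 lanes are packed two per dword, and an "unaligned" vector first
// completes the previously pushed half-filled dword before packing the rest.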
9390
9391 if (UseNSA && IsGFX11Plus) {
9392 Ops.push_back(NodePtr);
9393 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9394 Ops.push_back(RayOrigin);
9395 if (IsA16) {
9396 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9397 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9398 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9399 for (unsigned I = 0; I < 3; ++I) {
9400 MergedLanes.push_back(DAG.getBitcast(
9401 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9402 {DirLanes[I], InvDirLanes[I]})));
9403 }
9404 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9405 } else {
9406 Ops.push_back(RayDir);
9407 Ops.push_back(RayInvDir);
9408 }
9409 } else {
9410 if (Is64)
9411 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9412 2);
9413 else
9414 Ops.push_back(NodePtr);
9415
9416 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9417 packLanes(RayOrigin, true);
9418 packLanes(RayDir, true);
9419 packLanes(RayInvDir, false);
9420 }
9421
9422 if (!UseNSA) {
9423 // Build a single vector containing all the operands so far prepared.
9424 if (NumVAddrDwords > 12) {
9425 SDValue Undef = DAG.getUNDEF(MVT::i32);
9426 Ops.append(16 - Ops.size(), Undef);
9427 }
9428 assert(Ops.size() >= 8 && Ops.size() <= 12);
9429 SDValue MergedOps =
9430 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9431 Ops.clear();
9432 Ops.push_back(MergedOps);
9433 }
9434
9435 Ops.push_back(TDescr);
9436 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9437 Ops.push_back(M->getChain());
9438
9439 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9440 MachineMemOperand *MemRef = M->getMemOperand();
9441 DAG.setNodeMemRefs(NewNode, {MemRef});
9442 return SDValue(NewNode, 0);
9443 }
9444 case Intrinsic::amdgcn_global_atomic_fmin_num:
9445 case Intrinsic::amdgcn_global_atomic_fmax_num:
9446 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9447 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9448 MemSDNode *M = cast<MemSDNode>(Op);
9449 SDValue Ops[] = {
9450 M->getOperand(0), // Chain
9451 M->getOperand(2), // Ptr
9452 M->getOperand(3) // Value
9453 };
9454 unsigned Opcode = 0;
9455 switch (IntrID) {
9456 case Intrinsic::amdgcn_global_atomic_fmin_num:
9457 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9458 Opcode = ISD::ATOMIC_LOAD_FMIN;
9459 break;
9460 }
9461 case Intrinsic::amdgcn_global_atomic_fmax_num:
9462 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9463 Opcode = ISD::ATOMIC_LOAD_FMAX;
9464 break;
9465 }
9466 default:
9467 llvm_unreachable("unhandled atomic opcode");
9468 }
9469 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9470 Ops, M->getMemOperand());
9471 }
9472 case Intrinsic::amdgcn_s_get_barrier_state:
9473 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9474 SDValue Chain = Op->getOperand(0);
9475 SmallVector<SDValue, 2> Ops;
9476 unsigned Opc;
9477
9478 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9479 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9480 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9481 BarID = (BarID >> 4) & 0x3F;
9482 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9483 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9484 Ops.push_back(K);
9485 Ops.push_back(Chain);
9486 } else {
9487 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9488 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9489 SDValue M0Val;
9490 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9491 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9492 M0Val = SDValue(
9493 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9494 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9495 0);
9496 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9497 } else
9498 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9499 }
9500
9501 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9502 return SDValue(NewMI, 0);
9503 }
9504 default:
9505
9506 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9507 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9508 return lowerImage(Op, ImageDimIntr, DAG, true);
9509
9510 return SDValue();
9511 }
9512}
9513
9514// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9515// dwordx4 if on SI and handle TFE loads.
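// When TFE is requested the hardware returns the data dwords plus one trailing
// status dword, so the node is built with an i32 vector one element wider and
// the value/status parts are split back out below.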
9516SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9517 SDVTList VTList,
9518 ArrayRef<SDValue> Ops, EVT MemVT,
9519 MachineMemOperand *MMO,
9520 SelectionDAG &DAG) const {
9521 LLVMContext &C = *DAG.getContext();
9522 MachineFunction &MF = DAG.getMachineFunction();
9523 EVT VT = VTList.VTs[0];
9524
9525 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9526 bool IsTFE = VTList.NumVTs == 3;
9527 if (IsTFE) {
9528 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9529 unsigned NumOpDWords = NumValueDWords + 1;
9530 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9531 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9532 MachineMemOperand *OpDWordsMMO =
9533 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9534 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9535 OpDWordsVT, OpDWordsMMO, DAG);
9536 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9537 DAG.getVectorIdxConstant(NumValueDWords, DL));
9538 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9539 SDValue ValueDWords =
9540 NumValueDWords == 1
9541 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9542 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9543 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9544 ZeroIdx);
9545 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9546 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9547 }
9548
9549 if (!Subtarget->hasDwordx3LoadStores() &&
9550 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9551 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9552 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9553 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9554 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9555 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9556 WidenedMemVT, WidenedMMO);
9557 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9558 DAG.getVectorIdxConstant(0, DL));
9559 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9560 }
9561
9562 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9563}
9564
9565SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9566 bool ImageStore) const {
9567 EVT StoreVT = VData.getValueType();
9568
9569 // No change for f16 and legal vector D16 types.
9570 if (!StoreVT.isVector())
9571 return VData;
9572
9573 SDLoc DL(VData);
9574 unsigned NumElements = StoreVT.getVectorNumElements();
9575
9576 if (Subtarget->hasUnpackedD16VMem()) {
9577 // We need to unpack the packed data to store.
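// E.g. (illustrative) a v4f16 store becomes a v4i32 store in which each i32
// element carries one zero-extended 16-bit value in its low half.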
9578 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9579 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9580
9581 EVT EquivStoreVT =
9582 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9583 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9584 return DAG.UnrollVectorOp(ZExt.getNode());
9585 }
9586
9587 // The sq block of gfx8.1 does not estimate register use correctly for d16
9588 // image store instructions. The data operand is computed as if it were not a
9589 // d16 image instruction.
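// E.g. (illustrative) a v4f16 operand is rebuilt below as <4 x i32>: two dwords
// of packed data followed by two undef dwords, so the register count matches
// the non-d16 encoding the SQ block assumes.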
9590 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9591 // Bitcast to i16
9592 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9593 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9594
9595 // Decompose into scalars
9596 SmallVector<SDValue, 4> Elts;
9597 DAG.ExtractVectorElements(IntVData, Elts);
9598
9599 // Group pairs of i16 into v2i16 and bitcast to i32
9600 SmallVector<SDValue, 4> PackedElts;
9601 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9602 SDValue Pair =
9603 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9604 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9605 PackedElts.push_back(IntPair);
9606 }
9607 if ((NumElements % 2) == 1) {
9608 // Handle v3i16
9609 unsigned I = Elts.size() / 2;
9610 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9611 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9612 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9613 PackedElts.push_back(IntPair);
9614 }
9615
9616 // Pad using UNDEF
9617 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9618
9619 // Build final vector
9620 EVT VecVT =
9621 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9622 return DAG.getBuildVector(VecVT, DL, PackedElts);
9623 }
9624
9625 if (NumElements == 3) {
9626 EVT IntStoreVT =
9627 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9628 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9629
9630 EVT WidenedStoreVT = EVT::getVectorVT(
9631 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9632 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9633 WidenedStoreVT.getStoreSizeInBits());
9634 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9635 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9636 }
9637
9638 assert(isTypeLegal(StoreVT));
9639 return VData;
9640}
9641
9642SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9643 SelectionDAG &DAG) const {
9644 SDLoc DL(Op);
9645 SDValue Chain = Op.getOperand(0);
9646 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9647 MachineFunction &MF = DAG.getMachineFunction();
9648
9649 switch (IntrinsicID) {
9650 case Intrinsic::amdgcn_exp_compr: {
9651 if (!Subtarget->hasCompressedExport()) {
9652 DiagnosticInfoUnsupported BadIntrin(
9654 "intrinsic not supported on subtarget", DL.getDebugLoc());
9655 DAG.getContext()->diagnose(BadIntrin);
9656 }
9657 SDValue Src0 = Op.getOperand(4);
9658 SDValue Src1 = Op.getOperand(5);
9659 // Hack around illegal type on SI by directly selecting it.
9660 if (isTypeLegal(Src0.getValueType()))
9661 return SDValue();
9662
9663 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9664 SDValue Undef = DAG.getUNDEF(MVT::f32);
9665 const SDValue Ops[] = {
9666 Op.getOperand(2), // tgt
9667 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9668 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9669 Undef, // src2
9670 Undef, // src3
9671 Op.getOperand(7), // vm
9672 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9673 Op.getOperand(3), // en
9674 Op.getOperand(0) // Chain
9675 };
9676
9677 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9678 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9679 }
9680 case Intrinsic::amdgcn_s_barrier:
9681 case Intrinsic::amdgcn_s_barrier_signal:
9682 case Intrinsic::amdgcn_s_barrier_wait: {
9683 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9684 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9685 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9686 if (WGSize <= ST.getWavefrontSize()) {
9687 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9688 // s_barrier/s_barrier_wait to wave_barrier.
9689 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9690 return Op.getOperand(0);
9691 else
9692 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9693 MVT::Other, Op.getOperand(0)),
9694 0);
9695 }
9696 }
9697
9698 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9699 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9700 SDValue K =
9701 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9702 SDValue BarSignal =
9703 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9704 MVT::Other, K, Op.getOperand(0)),
9705 0);
9706 SDValue BarWait =
9707 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9708 BarSignal.getValue(0)),
9709 0);
9710 return BarWait;
9711 }
9712
9713 return SDValue();
9714 };
9715
9716 case Intrinsic::amdgcn_struct_tbuffer_store:
9717 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9718 SDValue VData = Op.getOperand(2);
9719 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9720 if (IsD16)
9721 VData = handleD16VData(VData, DAG);
9722 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9723 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9724 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9725 SDValue Ops[] = {
9726 Chain,
9727 VData, // vdata
9728 Rsrc, // rsrc
9729 Op.getOperand(4), // vindex
9730 VOffset, // voffset
9731 SOffset, // soffset
9732 Offset, // offset
9733 Op.getOperand(7), // format
9734 Op.getOperand(8), // cachepolicy, swizzled buffer
9735 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9736 };
9737 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9738 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9739 MemSDNode *M = cast<MemSDNode>(Op);
9740 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9741 M->getMemoryVT(), M->getMemOperand());
9742 }
9743
9744 case Intrinsic::amdgcn_raw_tbuffer_store:
9745 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9746 SDValue VData = Op.getOperand(2);
9747 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9748 if (IsD16)
9749 VData = handleD16VData(VData, DAG);
9750 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9751 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9752 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9753 SDValue Ops[] = {
9754 Chain,
9755 VData, // vdata
9756 Rsrc, // rsrc
9757 DAG.getConstant(0, DL, MVT::i32), // vindex
9758 VOffset, // voffset
9759 SOffset, // soffset
9760 Offset, // offset
9761 Op.getOperand(6), // format
9762 Op.getOperand(7), // cachepolicy, swizzled buffer
9763 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9764 };
9765 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9766 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9767 MemSDNode *M = cast<MemSDNode>(Op);
9768 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9769 M->getMemoryVT(), M->getMemOperand());
9770 }
9771
9772 case Intrinsic::amdgcn_raw_buffer_store:
9773 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9774 case Intrinsic::amdgcn_raw_buffer_store_format:
9775 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9776 const bool IsFormat =
9777 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9778 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9779
9780 SDValue VData = Op.getOperand(2);
9781 EVT VDataVT = VData.getValueType();
9782 EVT EltType = VDataVT.getScalarType();
9783 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9784 if (IsD16) {
9785 VData = handleD16VData(VData, DAG);
9786 VDataVT = VData.getValueType();
9787 }
9788
9789 if (!isTypeLegal(VDataVT)) {
9790 VData =
9791 DAG.getNode(ISD::BITCAST, DL,
9792 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9793 }
9794
9795 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9796 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9797 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9798 SDValue Ops[] = {
9799 Chain,
9800 VData,
9801 Rsrc,
9802 DAG.getConstant(0, DL, MVT::i32), // vindex
9803 VOffset, // voffset
9804 SOffset, // soffset
9805 Offset, // offset
9806 Op.getOperand(6), // cachepolicy, swizzled buffer
9807 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9808 };
9809 unsigned Opc =
9810 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9811 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9812 MemSDNode *M = cast<MemSDNode>(Op);
9813
9814 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9815 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9816 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9817
9818 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9819 M->getMemoryVT(), M->getMemOperand());
9820 }
9821
9822 case Intrinsic::amdgcn_struct_buffer_store:
9823 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9824 case Intrinsic::amdgcn_struct_buffer_store_format:
9825 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9826 const bool IsFormat =
9827 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9828 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9829
9830 SDValue VData = Op.getOperand(2);
9831 EVT VDataVT = VData.getValueType();
9832 EVT EltType = VDataVT.getScalarType();
9833 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9834
9835 if (IsD16) {
9836 VData = handleD16VData(VData, DAG);
9837 VDataVT = VData.getValueType();
9838 }
9839
9840 if (!isTypeLegal(VDataVT)) {
9841 VData =
9842 DAG.getNode(ISD::BITCAST, DL,
9843 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9844 }
9845
9846 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9847 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9848 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9849 SDValue Ops[] = {
9850 Chain,
9851 VData,
9852 Rsrc,
9853 Op.getOperand(4), // vindex
9854 VOffset, // voffset
9855 SOffset, // soffset
9856 Offset, // offset
9857 Op.getOperand(7), // cachepolicy, swizzled buffer
9858 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9859 };
9860 unsigned Opc =
9861 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9862 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9863 MemSDNode *M = cast<MemSDNode>(Op);
9864
9865 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9866 EVT VDataType = VData.getValueType().getScalarType();
9867 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9868 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9869
9870 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9871 M->getMemoryVT(), M->getMemOperand());
9872 }
9873 case Intrinsic::amdgcn_raw_buffer_load_lds:
9874 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9875 case Intrinsic::amdgcn_struct_buffer_load_lds:
9876 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9877 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9878 unsigned Opc;
9879 bool HasVIndex =
9880 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9881 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9882 unsigned OpOffset = HasVIndex ? 1 : 0;
9883 SDValue VOffset = Op.getOperand(5 + OpOffset);
9884 bool HasVOffset = !isNullConstant(VOffset);
9885 unsigned Size = Op->getConstantOperandVal(4);
9886
9887 switch (Size) {
9888 default:
9889 return SDValue();
9890 case 1:
9891 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9892 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9893 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9894 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9895 break;
9896 case 2:
9897 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9898 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9899 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9900 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9901 break;
9902 case 4:
9903 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9904 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9905 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9906 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9907 break;
9908 case 12:
9909 if (!Subtarget->hasLDSLoadB96_B128())
9910 return SDValue();
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9915 break;
9916 case 16:
9917 if (!Subtarget->hasLDSLoadB96_B128())
9918 return SDValue();
9919 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9920 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9921 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9922 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9923 break;
9924 }
9925
9926 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9927
9928 SmallVector<SDValue, 8> Ops;
9929
9930 if (HasVIndex && HasVOffset)
9931 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9932 {Op.getOperand(5), // VIndex
9933 VOffset}));
9934 else if (HasVIndex)
9935 Ops.push_back(Op.getOperand(5));
9936 else if (HasVOffset)
9937 Ops.push_back(VOffset);
9938
9939 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9940 Ops.push_back(Rsrc);
9941 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9942 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9943 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9944 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9945 Ops.push_back(DAG.getTargetConstant(
9946 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9947 DL, MVT::i8)); // cpol
9948 Ops.push_back(DAG.getTargetConstant(
9949 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9950 ? 1
9951 : 0,
9952 DL, MVT::i8)); // swz
9953 Ops.push_back(M0Val.getValue(0)); // Chain
9954 Ops.push_back(M0Val.getValue(1)); // Glue
9955
9956 auto *M = cast<MemSDNode>(Op);
9957 MachineMemOperand *LoadMMO = M->getMemOperand();
9958 // Don't set the offset value here because the pointer points to the base of
9959 // the buffer.
9960 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9961
9962 MachinePointerInfo StorePtrI = LoadPtrI;
9963 LoadPtrI.V = PoisonValue::get(
9964 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
9965 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
9966 StorePtrI.AddrSpace = AMDGPUAS::LDS_ADDRESS;
9967
9968 auto F = LoadMMO->getFlags() &
9969 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9970 LoadMMO =
9971 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9972 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9973
9974 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9975 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9976 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9977
9978 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9979 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9980
9981 return SDValue(Load, 0);
9982 }
9983 case Intrinsic::amdgcn_global_load_lds: {
9984 unsigned Opc;
9985 unsigned Size = Op->getConstantOperandVal(4);
9986 switch (Size) {
9987 default:
9988 return SDValue();
9989 case 1:
9990 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9991 break;
9992 case 2:
9993 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9994 break;
9995 case 4:
9996 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9997 break;
9998 case 12:
9999 if (!Subtarget->hasLDSLoadB96_B128())
10000 return SDValue();
10001 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10002 break;
10003 case 16:
10004 if (!Subtarget->hasLDSLoadB96_B128())
10005 return SDValue();
10006 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10007 break;
10008 }
10009
10010 auto *M = cast<MemSDNode>(Op);
10011 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10012
10013 SmallVector<SDValue, 6> Ops;
10014
10015 SDValue Addr = Op.getOperand(2); // Global ptr
10016 SDValue VOffset;
10017 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10018 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
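// If the address is (uniform base + zext(divergent i32)), the uniform base can
// feed the instruction's SADDR operand and the divergent part the VGPR offset;
// otherwise the whole address stays divergent.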
10019 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10020 SDValue LHS = Addr.getOperand(0);
10021 SDValue RHS = Addr.getOperand(1);
10022
10023 if (LHS->isDivergent())
10024 std::swap(LHS, RHS);
10025
10026 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10027 RHS.getOperand(0).getValueType() == MVT::i32) {
10028 // add (i64 sgpr), (zero_extend (i32 vgpr))
10029 Addr = LHS;
10030 VOffset = RHS.getOperand(0);
10031 }
10032 }
10033
10034 Ops.push_back(Addr);
10035 if (!Addr->isDivergent()) {
10036 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10037 if (!VOffset)
10038 VOffset =
10039 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10040 DAG.getTargetConstant(0, DL, MVT::i32)),
10041 0);
10042 Ops.push_back(VOffset);
10043 }
10044
10045 Ops.push_back(Op.getOperand(5)); // Offset
10046 Ops.push_back(Op.getOperand(6)); // CPol
10047 Ops.push_back(M0Val.getValue(0)); // Chain
10048 Ops.push_back(M0Val.getValue(1)); // Glue
10049
10050 MachineMemOperand *LoadMMO = M->getMemOperand();
10051 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10052 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10053 MachinePointerInfo StorePtrI = LoadPtrI;
10054 LoadPtrI.V = PoisonValue::get(
10055 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10056 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10057 StorePtrI.AddrSpace = AMDGPUAS::LDS_ADDRESS;
10058 auto F = LoadMMO->getFlags() &
10059 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10060 LoadMMO =
10061 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10062 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10063 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10064 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10065 LoadMMO->getAAInfo());
10066
10067 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10068 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10069
10070 return SDValue(Load, 0);
10071 }
10072 case Intrinsic::amdgcn_end_cf:
10073 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10074 Op->getOperand(2), Chain),
10075 0);
10076 case Intrinsic::amdgcn_s_barrier_init:
10077 case Intrinsic::amdgcn_s_barrier_signal_var: {
10078 // these two intrinsics have two operands: barrier pointer and member count
10079 SDValue Chain = Op->getOperand(0);
10080 SmallVector<SDValue, 2> Ops;
10081 SDValue BarOp = Op->getOperand(2);
10082 SDValue CntOp = Op->getOperand(3);
10083 SDValue M0Val;
10084 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10085 ? AMDGPU::S_BARRIER_INIT_M0
10086 : AMDGPU::S_BARRIER_SIGNAL_M0;
10087 // extract the BarrierID from bits 4-9 of BarOp
10088 SDValue BarID;
10089 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10090 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10091 BarID =
10092 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10093 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10094 0);
10095 // Member count should be put into M0[ShAmt:+6]
10096 // Barrier ID should be put into M0[5:0]
10097 M0Val =
10098 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10099 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10100 0);
10101 constexpr unsigned ShAmt = 16;
10102 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10103 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10104
10105 M0Val = SDValue(
10106 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10107
10108 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10109
10110 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10111 return SDValue(NewMI, 0);
10112 }
10113 case Intrinsic::amdgcn_s_barrier_join: {
10114 // these three intrinsics have one operand: barrier pointer
10115 SDValue Chain = Op->getOperand(0);
10116 SmallVector<SDValue, 2> Ops;
10117 SDValue BarOp = Op->getOperand(2);
10118 unsigned Opc;
10119
10120 if (isa<ConstantSDNode>(BarOp)) {
10121 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10122 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10123
10124 // extract the BarrierID from bits 4-9 of the immediate
10125 unsigned BarID = (BarVal >> 4) & 0x3F;
10126 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10127 Ops.push_back(K);
10128 Ops.push_back(Chain);
10129 } else {
10130 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10131
10132 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10133 SDValue M0Val;
10134 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10135 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10136 M0Val =
10137 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10138 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10139 0);
10140 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10141 }
10142
10143 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10144 return SDValue(NewMI, 0);
10145 }
10146 case Intrinsic::amdgcn_s_prefetch_data: {
10147 // For non-global address space preserve the chain and remove the call.
10148 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10149 return Op.getOperand(0);
10150 return Op;
10151 }
10152 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10153 SDValue Ops[] = {
10154 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10155 Op.getOperand(3), // offset
10156 Op.getOperand(4), // length
10157 };
10158
10159 MemSDNode *M = cast<MemSDNode>(Op);
10160 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10161 Op->getVTList(), Ops, M->getMemoryVT(),
10162 M->getMemOperand());
10163 }
10164 default: {
10165 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10166 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10167 return lowerImage(Op, ImageDimIntr, DAG, true);
10168
10169 return Op;
10170 }
10171 }
10172}
10173
10174// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10175// offset (the offset that is included in bounds checking and swizzling, to be
10176// split between the instruction's voffset and immoffset fields) and soffset
10177// (the offset that is excluded from bounds checking and swizzling, to go in
10178// the instruction's soffset field). This function takes the first kind of
10179// offset and figures out how to split it between voffset and immoffset.
10180std::pair<SDValue, SDValue>
10181SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10182 SDLoc DL(Offset);
10183 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10184 SDValue N0 = Offset;
10185 ConstantSDNode *C1 = nullptr;
10186
10187 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10188 N0 = SDValue();
10189 else if (DAG.isBaseWithConstantOffset(N0)) {
10190 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10191 N0 = N0.getOperand(0);
10192 }
10193
10194 if (C1) {
10195 unsigned ImmOffset = C1->getZExtValue();
10196 // If the immediate value is too big for the immoffset field, put only bits
10197 // that would normally fit in the immoffset field. The remaining value that
10198 // is copied/added for the voffset field is a large power of 2, and it
10199 // stands more chance of being CSEd with the copy/add for another similar
10200 // load/store.
10201 // However, do not do that rounding down if that is a negative
10202 // number, as it appears to be illegal to have a negative offset in the
10203 // vgpr, even if adding the immediate offset makes it positive.
10204 unsigned Overflow = ImmOffset & ~MaxImm;
10205 ImmOffset -= Overflow;
10206 if ((int32_t)Overflow < 0) {
10207 Overflow += ImmOffset;
10208 ImmOffset = 0;
10209 }
10210 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10211 if (Overflow) {
10212 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10213 if (!N0)
10214 N0 = OverflowVal;
10215 else {
10216 SDValue Ops[] = {N0, OverflowVal};
10217 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10218 }
10219 }
10220 }
10221 if (!N0)
10222 N0 = DAG.getConstant(0, DL, MVT::i32);
10223 if (!C1)
10224 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10225 return {N0, SDValue(C1, 0)};
10226}
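
To make the split above concrete, here is a small standalone sketch (not from this file; the helper name and the 4095 immediate limit are illustrative assumptions) that performs the same arithmetic on plain integers. With Offset = 0x1234 it returns a register offset of 0x1000 and an immediate offset of 0x234, matching the CSE-friendly remainder described in the comment.

#include <cassert>
#include <cstdint>
#include <utility>

// Split a combined byte offset into {voffset, immoffset}: keep the low bits
// that fit the immediate field and push the remainder (a multiple of
// MaxImm + 1) into the register offset, unless that remainder is negative
// when viewed as an i32.
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t Offset,
                                                 uint32_t MaxImm = 4095) {
  uint32_t ImmOffset = Offset & MaxImm;
  uint32_t Overflow = Offset & ~MaxImm;
  if ((int32_t)Overflow < 0) {
    // Negative voffsets are not allowed, so fold everything back into the
    // register operand in that case.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}

int main() {
  auto [VOff, Imm] = splitOffset(0x1234);
  assert(VOff == 0x1000 && Imm == 0x234);
  return 0;
}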
10227
10228// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10229// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10230// pointed to by Offsets.
10231void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10232 SelectionDAG &DAG, SDValue *Offsets,
10233 Align Alignment) const {
10234 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10235 SDLoc DL(CombinedOffset);
10236 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10237 uint32_t Imm = C->getZExtValue();
10238 uint32_t SOffset, ImmOffset;
10239 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10240 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10241 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10242 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10243 return;
10244 }
10245 }
10246 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10247 SDValue N0 = CombinedOffset.getOperand(0);
10248 SDValue N1 = CombinedOffset.getOperand(1);
10249 uint32_t SOffset, ImmOffset;
10250 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10251 if (Offset >= 0 &&
10252 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10253 Offsets[0] = N0;
10254 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10255 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10256 return;
10257 }
10258 }
10259
10260 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10261 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10262 : DAG.getConstant(0, DL, MVT::i32);
10263
10264 Offsets[0] = CombinedOffset;
10265 Offsets[1] = SOffsetZero;
10266 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10267}
10268
10269SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10270 SelectionDAG &DAG) const {
10271 if (!MaybePointer.getValueType().isScalarInteger())
10272 return MaybePointer;
10273
10274 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10275 return Rsrc;
10276}
10277
10278// Wrap a global or flat pointer into a buffer intrinsic using the flags
10279// specified in the intrinsic.
10280SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10281 SelectionDAG &DAG) const {
10282 SDLoc Loc(Op);
10283
10284 SDValue Pointer = Op->getOperand(1);
10285 SDValue Stride = Op->getOperand(2);
10286 SDValue NumRecords = Op->getOperand(3);
10287 SDValue Flags = Op->getOperand(4);
10288
10289 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10290 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10291 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10292 std::optional<uint32_t> ConstStride = std::nullopt;
10293 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10294 ConstStride = ConstNode->getZExtValue();
10295
10296 SDValue NewHighHalf = Masked;
10297 if (!ConstStride || *ConstStride != 0) {
10298 SDValue ShiftedStride;
10299 if (ConstStride) {
10300 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10301 } else {
10302 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10303 ShiftedStride =
10304 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10305 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10306 }
10307 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10308 }
10309
10310 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10311 NewHighHalf, NumRecords, Flags);
10312 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10313 return RsrcPtr;
10314}
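
As a rough picture of the packing performed above, this standalone sketch (names are illustrative and the stride is assumed to fit in 16 bits; nothing here is taken from this file) assembles the same four words with plain integer operations.

#include <array>
#include <cstdint>

// Pack a 64-bit base pointer, a stride, a record count and flag bits into the
// four 32-bit resource words, mirroring the node construction above: word 1
// keeps the low 16 bits of the pointer's high half and carries the stride in
// bits 16 and up.
static std::array<uint32_t, 4> makeRsrcWords(uint64_t Ptr, uint32_t Stride,
                                             uint32_t NumRecords,
                                             uint32_t Flags) {
  uint32_t Lo = uint32_t(Ptr);
  uint32_t Hi = uint32_t(Ptr >> 32) & 0xffffu;
  return {Lo, Hi | (Stride << 16), NumRecords, Flags};
}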
10315
10316// Handle 8 bit and 16 bit buffer loads
10317SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10318 EVT LoadVT, SDLoc DL,
10319 ArrayRef<SDValue> Ops,
10320 MachineMemOperand *MMO,
10321 bool IsTFE) const {
10322 EVT IntVT = LoadVT.changeTypeToInteger();
10323
10324 if (IsTFE) {
10325 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10326 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10327 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10328 MachineFunction &MF = DAG.getMachineFunction();
10329 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10330 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10331 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10332 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10333 DAG.getConstant(1, DL, MVT::i32));
10334 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10335 DAG.getConstant(0, DL, MVT::i32));
10336 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10337 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10338 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10339 }
10340
10341 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10342 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10343 : AMDGPUISD::BUFFER_LOAD_USHORT;
10344
10345 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10346 SDValue BufferLoad =
10347 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10348 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10349 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10350
10351 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10352}
10353
10354// Handle 8 bit and 16 bit buffer stores
10355SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10356 EVT VDataType, SDLoc DL,
10357 SDValue Ops[],
10358 MemSDNode *M) const {
10359 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10360 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10361
10362 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10363 Ops[1] = BufferStoreExt;
10364 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10365 : AMDGPUISD::BUFFER_STORE_SHORT;
10366 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10367 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10368 M->getMemOperand());
10369}
10370
10371 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10372 SDValue Op, const SDLoc &SL, EVT VT) {
10373 if (VT.bitsLT(Op.getValueType()))
10374 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10375
10376 switch (ExtType) {
10377 case ISD::SEXTLOAD:
10378 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10379 case ISD::ZEXTLOAD:
10380 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10381 case ISD::EXTLOAD:
10382 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10383 case ISD::NON_EXTLOAD:
10384 return Op;
10385 }
10386
10387 llvm_unreachable("invalid ext type");
10388}
10389
10390// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10391// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10392SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10393 DAGCombinerInfo &DCI) const {
10394 SelectionDAG &DAG = DCI.DAG;
10395 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10396 return SDValue();
10397
10398 // FIXME: Constant loads should all be marked invariant.
10399 unsigned AS = Ld->getAddressSpace();
10400 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10401 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10402 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10403 return SDValue();
10404
10405 // Don't do this early, since it may interfere with adjacent load merging for
10406 // illegal types. We can avoid losing alignment information for exotic types
10407 // pre-legalize.
10408 EVT MemVT = Ld->getMemoryVT();
10409 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10410 MemVT.getSizeInBits() >= 32)
10411 return SDValue();
10412
10413 SDLoc SL(Ld);
10414
10415 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10416 "unexpected vector extload");
10417
10418 // TODO: Drop only high part of range.
10419 SDValue Ptr = Ld->getBasePtr();
10420 SDValue NewLoad = DAG.getLoad(
10421 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10422 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10423 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10424 nullptr); // Drop ranges
10425
10426 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10427 if (MemVT.isFloatingPoint()) {
10428 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10429 "unexpected fp extload");
10430 TruncVT = MemVT.changeTypeToInteger();
10431 }
10432
10433 SDValue Cvt = NewLoad;
10434 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10435 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10436 DAG.getValueType(TruncVT));
10437 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10438 Ld->getExtensionType() == ISD::EXTLOAD) {
10439 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10440 } else {
10441 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
10442 }
10443
10444 EVT VT = Ld->getValueType(0);
10445 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10446
10447 DCI.AddToWorklist(Cvt.getNode());
10448
10449 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10450 // the appropriate extension from the 32-bit load.
10451 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10452 DCI.AddToWorklist(Cvt.getNode());
10453
10454 // Handle conversion back to floating point if necessary.
10455 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10456
10457 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10458}
10459
10460 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10461 const SIMachineFunctionInfo &Info) {
10462 // TODO: Should check if the address can definitely not access stack.
10463 if (Info.isEntryFunction())
10464 return Info.getUserSGPRInfo().hasFlatScratchInit();
10465 return true;
10466}
10467
10468SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10469 SDLoc DL(Op);
10470 LoadSDNode *Load = cast<LoadSDNode>(Op);
10471 ISD::LoadExtType ExtType = Load->getExtensionType();
10472 EVT MemVT = Load->getMemoryVT();
10473 MachineMemOperand *MMO = Load->getMemOperand();
10474
10475 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10476 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10477 return SDValue();
10478
10479 // FIXME: Copied from PPC
10480 // First, load into 32 bits, then truncate to 1 bit.
10481
10482 SDValue Chain = Load->getChain();
10483 SDValue BasePtr = Load->getBasePtr();
10484
10485 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10486
10487 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10488 RealMemVT, MMO);
10489
10490 if (!MemVT.isVector()) {
10491 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10492 NewLD.getValue(1)};
10493
10494 return DAG.getMergeValues(Ops, DL);
10495 }
10496
10497 SmallVector<SDValue, 3> Elts;
10498 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10499 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10500 DAG.getConstant(I, DL, MVT::i32));
10501
10502 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10503 }
10504
10505 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10506
10507 return DAG.getMergeValues(Ops, DL);
10508 }
10509
10510 if (!MemVT.isVector())
10511 return SDValue();
10512
10513 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10514 "Custom lowering for non-i32 vectors hasn't been implemented.");
10515
10516 Align Alignment = Load->getAlign();
10517 unsigned AS = Load->getAddressSpace();
10518 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10519 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10520 return SplitVectorLoad(Op, DAG);
10521 }
10522
10523 MachineFunction &MF = DAG.getMachineFunction();
10524 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10525 // If there is a possibility that flat instruction access scratch memory
10526 // then we need to use the same legalization rules we use for private.
10527 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10528 !Subtarget->hasMultiDwordFlatScratchAddressing())
10529 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10530 ? AMDGPUAS::PRIVATE_ADDRESS
10531 : AMDGPUAS::GLOBAL_ADDRESS;
10532
10533 unsigned NumElements = MemVT.getVectorNumElements();
10534
10535 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10536 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10537 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10538 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10539 isMemOpHasNoClobberedMemOperand(Load))) {
10540 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10541 Alignment >= Align(4) && NumElements < 32) {
10542 if (MemVT.isPow2VectorType() ||
10543 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10544 return SDValue();
10545 return WidenOrSplitVectorLoad(Op, DAG);
10546 }
10547 // Non-uniform loads will be selected to MUBUF instructions, so they
10548 // have the same legalization requirements as global and private
10549 // loads.
10550 //
10551 }
10552 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10553 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10554 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10555 if (NumElements > 4)
10556 return SplitVectorLoad(Op, DAG);
10557 // v3 loads not supported on SI.
10558 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10559 return WidenOrSplitVectorLoad(Op, DAG);
10560
10561 // v3 and v4 loads are supported for private and global memory.
10562 return SDValue();
10563 }
10564 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10565 // Depending on the setting of the private_element_size field in the
10566 // resource descriptor, we can only make private accesses up to a certain
10567 // size.
10568 switch (Subtarget->getMaxPrivateElementSize()) {
10569 case 4: {
10570 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10571 return DAG.getMergeValues({Op0, Op1}, DL);
10572 }
10573 case 8:
10574 if (NumElements > 2)
10575 return SplitVectorLoad(Op, DAG);
10576 return SDValue();
10577 case 16:
10578 // Same as global/flat
10579 if (NumElements > 4)
10580 return SplitVectorLoad(Op, DAG);
10581 // v3 loads not supported on SI.
10582 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10583 return WidenOrSplitVectorLoad(Op, DAG);
10584
10585 return SDValue();
10586 default:
10587 llvm_unreachable("unsupported private_element_size");
10588 }
10589 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10590 unsigned Fast = 0;
10591 auto Flags = Load->getMemOperand()->getFlags();
10592 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10593 Load->getAlign(), Flags, &Fast) &&
10594 Fast > 1)
10595 return SDValue();
10596
10597 if (MemVT.isVector())
10598 return SplitVectorLoad(Op, DAG);
10599 }
10600
10601 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10602 MemVT, *Load->getMemOperand())) {
10603 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10604 return DAG.getMergeValues({Op0, Op1}, DL);
10605 }
10606
10607 return SDValue();
10608}
10609
10610SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10611 EVT VT = Op.getValueType();
10612 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10613 VT.getSizeInBits() == 512)
10614 return splitTernaryVectorOp(Op, DAG);
10615
10616 assert(VT.getSizeInBits() == 64);
10617
10618 SDLoc DL(Op);
10619 SDValue Cond = Op.getOperand(0);
10620
10621 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10622 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10623
10624 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10625 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10626
10627 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10628 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10629
10630 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10631
10632 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10633 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10634
10635 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10636
10637 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10638 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10639}
10640
10641// Catch division cases where we can use shortcuts with rcp and rsq
10642// instructions.
10643SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10644 SelectionDAG &DAG) const {
10645 SDLoc SL(Op);
10646 SDValue LHS = Op.getOperand(0);
10647 SDValue RHS = Op.getOperand(1);
10648 EVT VT = Op.getValueType();
10649 const SDNodeFlags Flags = Op->getFlags();
10650
10651 bool AllowInaccurateRcp =
10652 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10653
10654 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10655 // Without !fpmath accuracy information, we can't do more because we don't
10656 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10657 // f16 is always accurate enough
10658 if (!AllowInaccurateRcp && VT != MVT::f16)
10659 return SDValue();
10660
10661 if (CLHS->isExactlyValue(1.0)) {
10662 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10663 // the CI documentation they have a worst case error of 1 ulp.
10664 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10665 // use it as long as we aren't trying to use denormals.
10666 //
10667 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10668
10669 // 1.0 / sqrt(x) -> rsq(x)
10670
10671 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10672 // error seems really high at 2^29 ULP.
10673 // 1.0 / x -> rcp(x)
10674 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10675 }
10676
10677 // Same as for 1.0, but expand the sign out of the constant.
10678 if (CLHS->isExactlyValue(-1.0)) {
10679 // -1.0 / x -> rcp (fneg x)
10680 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10681 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10682 }
10683 }
10684
10685 // For f16 require afn or arcp.
10686 // For f32 require afn.
10687 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10688 return SDValue();
10689
10690 // Turn into multiply by the reciprocal.
10691 // x / y -> x * (1.0 / y)
10692 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10693 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10694}
10695
10696SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10697 SelectionDAG &DAG) const {
10698 SDLoc SL(Op);
10699 SDValue X = Op.getOperand(0);
10700 SDValue Y = Op.getOperand(1);
10701 EVT VT = Op.getValueType();
10702 const SDNodeFlags Flags = Op->getFlags();
10703
10704 bool AllowInaccurateDiv =
10705 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10706 if (!AllowInaccurateDiv)
10707 return SDValue();
10708
10709 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10710 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10711
10712 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10713 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10714
10715 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10716 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10717 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10718 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10719 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10720 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10721}
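
The FMA chain above is easier to follow as scalar math. A minimal sketch, assuming an externally supplied reciprocal estimate in place of the hardware RCP (the helper name is illustrative):

#include <cmath>

// Two Newton-Raphson refinements of a reciprocal estimate followed by one
// residual correction of the quotient, mirroring the FMA chain above.
static double fastDiv(double X, double Y, double RcpEstimate) {
  double NegY = -Y;
  double R = RcpEstimate;
  R = std::fma(std::fma(NegY, R, 1.0), R, R); // first refinement of 1/Y
  R = std::fma(std::fma(NegY, R, 1.0), R, R); // second refinement
  double Ret = X * R;
  return std::fma(std::fma(NegY, Ret, X), R, Ret); // correct X * (1/Y)
}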
10722
10723static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10724 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10725 SDNodeFlags Flags) {
10726 if (GlueChain->getNumValues() <= 1) {
10727 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10728 }
10729
10730 assert(GlueChain->getNumValues() == 3);
10731
10732 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10733 switch (Opcode) {
10734 default:
10735 llvm_unreachable("no chain equivalent for opcode");
10736 case ISD::FMUL:
10737 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10738 break;
10739 }
10740
10741 return DAG.getNode(Opcode, SL, VTList,
10742 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10743 Flags);
10744}
10745
10746static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10747 EVT VT, SDValue A, SDValue B, SDValue C,
10748 SDValue GlueChain, SDNodeFlags Flags) {
10749 if (GlueChain->getNumValues() <= 1) {
10750 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10751 }
10752
10753 assert(GlueChain->getNumValues() == 3);
10754
10755 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10756 switch (Opcode) {
10757 default:
10758 llvm_unreachable("no chain equivalent for opcode");
10759 case ISD::FMA:
10760 Opcode = AMDGPUISD::FMA_W_CHAIN;
10761 break;
10762 }
10763
10764 return DAG.getNode(Opcode, SL, VTList,
10765 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10766 Flags);
10767}
10768
10769SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10770 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10771 return FastLowered;
10772
10773 SDLoc SL(Op);
10774 SDValue LHS = Op.getOperand(0);
10775 SDValue RHS = Op.getOperand(1);
10776
10777 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10778 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10779 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10780 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10781 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10782 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10783 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10784 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10785 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10786 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10787 // q16.u = opx(V_CVT_F16_F32, q32.u);
10788 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10789
10790 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10791 unsigned FMADOpCode =
10792 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10793
10794 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10795 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10796 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10797 SDValue Rcp =
10798 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10799 SDValue Quot =
10800 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10801 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10802 Op->getFlags());
10803 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10804 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10805 Op->getFlags());
10806 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10807 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10808 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10809 DAG.getConstant(0xff800000, SL, MVT::i32));
10810 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10811 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10812 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10813 DAG.getTargetConstant(0, SL, MVT::i32));
10814 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10815 Op->getFlags());
10816}
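
A host-side sketch of the sequence in the comment block above, using float for the f32 steps and std::fmaf for the MAD operations; the final V_DIV_FIXUP_F16 special-case handling is omitted and the helper name is illustrative.

#include <cmath>
#include <cstdint>
#include <cstring>

// f32 emulation of the q/e/tmp sequence from the comment block above; the
// bit mask keeps only the sign and exponent of the last error term before it
// is added back to the quotient.
static float fdiv16Approx(float N, float D) {
  float R = 1.0f / D;            // stands in for V_RCP_F32
  float Q = N * R;
  float E = std::fmaf(-D, Q, N);
  Q = std::fmaf(E, R, Q);
  E = std::fmaf(-D, Q, N);
  float Tmp = E * R;
  uint32_t Bits;
  std::memcpy(&Bits, &Tmp, sizeof(Bits));
  Bits &= 0xff800000u;
  std::memcpy(&Tmp, &Bits, sizeof(Bits));
  return Tmp + Q;                // would then be rounded to f16
}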
10817
10818// Faster 2.5 ULP division that does not support denormals.
10819SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10820 SDNodeFlags Flags = Op->getFlags();
10821 SDLoc SL(Op);
10822 SDValue LHS = Op.getOperand(1);
10823 SDValue RHS = Op.getOperand(2);
10824
10825 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10826
10827 const APFloat K0Val(0x1p+96f);
10828 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10829
10830 const APFloat K1Val(0x1p-32f);
10831 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10832
10833 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10834
10835 EVT SetCCVT =
10836 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10837
10838 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10839
10840 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10841
10842 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10843
10844 // rcp does not support denormals.
10845 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10846
10847 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10848
10849 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10850}
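
The scaling trick above can be sketched with scalar floats; 1.0f / x stands in for the hardware rcp and the helper name is an illustrative assumption.

#include <cmath>

// Pre/post scaling used above: a huge denominator is scaled down by 0x1p-32
// before the reciprocal, and the same factor is applied to the product so the
// scaling cancels out.
static float fdivFastApprox(float LHS, float RHS) {
  float Scale = (std::fabs(RHS) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Scale); // stands in for V_RCP_F32
  return Scale * (LHS * Rcp);
}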
10851
10852// Returns immediate value for setting the F32 denorm mode when using the
10853// S_DENORM_MODE instruction.
10854 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10855 const SIMachineFunctionInfo *Info,
10856 const GCNSubtarget *ST) {
10857 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10858 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10859 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10860 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10861}
10862
10863SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10864 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10865 return FastLowered;
10866
10867 // The selection matcher assumes anything with a chain selecting to a
10868 // mayRaiseFPException machine instruction. Since we're introducing a chain
10869 // here, we need to explicitly report nofpexcept for the regular fdiv
10870 // lowering.
10871 SDNodeFlags Flags = Op->getFlags();
10872 Flags.setNoFPExcept(true);
10873
10874 SDLoc SL(Op);
10875 SDValue LHS = Op.getOperand(0);
10876 SDValue RHS = Op.getOperand(1);
10877
10878 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10879
10880 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10881
10882 SDValue DenominatorScaled =
10883 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10884 SDValue NumeratorScaled =
10885 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10886
10887 // Denominator is scaled to not be denormal, so using rcp is ok.
10888 SDValue ApproxRcp =
10889 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10890 SDValue NegDivScale0 =
10891 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10892
10893 using namespace AMDGPU::Hwreg;
10894 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10895 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10896
10897 const MachineFunction &MF = DAG.getMachineFunction();
10898 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10899 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10900
10901 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10902 const bool HasDynamicDenormals =
10903 (DenormMode.Input == DenormalMode::Dynamic) ||
10904 (DenormMode.Output == DenormalMode::Dynamic);
10905
10906 SDValue SavedDenormMode;
10907
10908 if (!PreservesDenormals) {
10909 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10910 // lowering. The chain dependence is insufficient, and we need glue. We do
10911 // not need the glue variants in a strictfp function.
10912
10913 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10914
10915 SDValue Glue = DAG.getEntryNode();
10916 if (HasDynamicDenormals) {
10917 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10918 DAG.getVTList(MVT::i32, MVT::Glue),
10919 {BitField, Glue});
10920 SavedDenormMode = SDValue(GetReg, 0);
10921
10922 Glue = DAG.getMergeValues(
10923 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10924 }
10925
10926 SDNode *EnableDenorm;
10927 if (Subtarget->hasDenormModeInst()) {
10928 const SDValue EnableDenormValue =
10929 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10930
10931 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10932 EnableDenormValue)
10933 .getNode();
10934 } else {
10935 const SDValue EnableDenormValue =
10936 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10937 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10938 {EnableDenormValue, BitField, Glue});
10939 }
10940
10941 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10942 SDValue(EnableDenorm, 1)};
10943
10944 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10945 }
10946
10947 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10948 ApproxRcp, One, NegDivScale0, Flags);
10949
10950 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10951 ApproxRcp, Fma0, Flags);
10952
10953 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10954 Fma1, Flags);
10955
10956 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10957 NumeratorScaled, Mul, Flags);
10958
10959 SDValue Fma3 =
10960 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10961
10962 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10963 NumeratorScaled, Fma3, Flags);
10964
10965 if (!PreservesDenormals) {
10966 SDNode *DisableDenorm;
10967 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10968 const SDValue DisableDenormValue = getSPDenormModeValue(
10969 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10970
10971 DisableDenorm =
10972 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10973 DisableDenormValue, Fma4.getValue(2))
10974 .getNode();
10975 } else {
10976 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10977 const SDValue DisableDenormValue =
10978 HasDynamicDenormals
10979 ? SavedDenormMode
10980 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10981
10982 DisableDenorm = DAG.getMachineNode(
10983 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10984 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10985 }
10986
10987 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10988 SDValue(DisableDenorm, 0), DAG.getRoot());
10989 DAG.setRoot(OutputChain);
10990 }
10991
10992 SDValue Scale = NumeratorScaled.getValue(1);
10993 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10994 {Fma4, Fma1, Fma3, Scale}, Flags);
10995
10996 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10997}
10998
10999SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11000 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11001 return FastLowered;
11002
11003 SDLoc SL(Op);
11004 SDValue X = Op.getOperand(0);
11005 SDValue Y = Op.getOperand(1);
11006
11007 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11008
11009 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11010
11011 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11012
11013 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11014
11015 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11016
11017 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11018
11019 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11020
11021 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11022
11023 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11024
11025 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11026 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11027
11028 SDValue Fma4 =
11029 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11030
11031 SDValue Scale;
11032
11033 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11034 // Workaround a hardware bug on SI where the condition output from div_scale
11035 // is not usable.
11036
11037 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11038
11039 // Figure out which scale to use for div_fmas.
11040 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11041 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11042 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11043 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11044
11045 SDValue NumHi =
11046 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11047 SDValue DenHi =
11048 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11049
11050 SDValue Scale0Hi =
11051 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11052 SDValue Scale1Hi =
11053 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11054
11055 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11056 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11057 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11058 } else {
11059 Scale = DivScale1.getValue(1);
11060 }
11061
11062 SDValue Fmas =
11063 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11064
11065 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11066}
11067
11068SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11069 EVT VT = Op.getValueType();
11070
11071 if (VT == MVT::f32)
11072 return LowerFDIV32(Op, DAG);
11073
11074 if (VT == MVT::f64)
11075 return LowerFDIV64(Op, DAG);
11076
11077 if (VT == MVT::f16)
11078 return LowerFDIV16(Op, DAG);
11079
11080 llvm_unreachable("Unexpected type for fdiv");
11081}
11082
11083SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11084 SDLoc dl(Op);
11085 SDValue Val = Op.getOperand(0);
11086 EVT VT = Val.getValueType();
11087 EVT ResultExpVT = Op->getValueType(1);
11088 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11089
11090 SDValue Mant = DAG.getNode(
11091 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11092 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11093
11094 SDValue Exp = DAG.getNode(
11095 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11096 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11097
11098 if (Subtarget->hasFractBug()) {
11099 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11100 SDValue Inf =
11101 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11102
11103 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11104 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11105 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11106 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11107 }
11108
11109 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11110 return DAG.getMergeValues({Mant, CastExp}, dl);
11111}
11112
11113SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11114 SDLoc DL(Op);
11115 StoreSDNode *Store = cast<StoreSDNode>(Op);
11116 EVT VT = Store->getMemoryVT();
11117
11118 if (VT == MVT::i1) {
11119 return DAG.getTruncStore(
11120 Store->getChain(), DL,
11121 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11122 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11123 }
11124
11125 assert(VT.isVector() &&
11126 Store->getValue().getValueType().getScalarType() == MVT::i32);
11127
11128 unsigned AS = Store->getAddressSpace();
11129 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11130 Store->getAlign().value() < VT.getStoreSize() &&
11131 VT.getSizeInBits() > 32) {
11132 return SplitVectorStore(Op, DAG);
11133 }
11134
11135 MachineFunction &MF = DAG.getMachineFunction();
11136 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11137 // If there is a possibility that flat instruction access scratch memory
11138 // then we need to use the same legalization rules we use for private.
11139 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11140 !Subtarget->hasMultiDwordFlatScratchAddressing())
11141 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11142 ? AMDGPUAS::PRIVATE_ADDRESS
11143 : AMDGPUAS::GLOBAL_ADDRESS;
11144
11145 unsigned NumElements = VT.getVectorNumElements();
11146 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11147 if (NumElements > 4)
11148 return SplitVectorStore(Op, DAG);
11149 // v3 stores not supported on SI.
11150 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11151 return SplitVectorStore(Op, DAG);
11152
11153 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11154 VT, *Store->getMemOperand()))
11155 return expandUnalignedStore(Store, DAG);
11156
11157 return SDValue();
11158 }
11159 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11160 switch (Subtarget->getMaxPrivateElementSize()) {
11161 case 4:
11162 return scalarizeVectorStore(Store, DAG);
11163 case 8:
11164 if (NumElements > 2)
11165 return SplitVectorStore(Op, DAG);
11166 return SDValue();
11167 case 16:
11168 if (NumElements > 4 ||
11169 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11170 return SplitVectorStore(Op, DAG);
11171 return SDValue();
11172 default:
11173 llvm_unreachable("unsupported private_element_size");
11174 }
11175 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11176 unsigned Fast = 0;
11177 auto Flags = Store->getMemOperand()->getFlags();
11178 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11179 Store->getAlign(), Flags, &Fast) &&
11180 Fast > 1)
11181 return SDValue();
11182
11183 if (VT.isVector())
11184 return SplitVectorStore(Op, DAG);
11185
11186 return expandUnalignedStore(Store, DAG);
11187 }
11188
11189 // Probably an invalid store. If so we'll end up emitting a selection error.
11190 return SDValue();
11191}
11192
11193// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11194SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11195 SDLoc SL(Op);
11196 assert(!Subtarget->has16BitInsts());
11197 SDNodeFlags Flags = Op->getFlags();
11198 SDValue Ext =
11199 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11200
11201 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11202 SDValue Sqrt =
11203 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11204
11205 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11206 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11207}
11208
11209SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11210 SDLoc DL(Op);
11211 SDNodeFlags Flags = Op->getFlags();
11212 MVT VT = Op.getValueType().getSimpleVT();
11213 const SDValue X = Op.getOperand(0);
11214
11215 if (allowApproxFunc(DAG, Flags)) {
11216 // Instruction is 1ulp but ignores denormals.
11217 return DAG.getNode(
11218 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11219 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11220 }
11221
11222 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11223 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11224
11225 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11226
11227 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11228
11229 SDValue SqrtX =
11230 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11231
11232 SDValue SqrtS;
11233 if (needsDenormHandlingF32(DAG, X, Flags)) {
11234 SDValue SqrtID =
11235 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11236 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11237
11238 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11239 SDValue SqrtSNextDownInt =
11240 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11241 DAG.getAllOnesConstant(DL, MVT::i32));
11242 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11243
11244 SDValue NegSqrtSNextDown =
11245 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11246
11247 SDValue SqrtVP =
11248 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11249
11250 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11251 DAG.getConstant(1, DL, MVT::i32));
11252 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11253
11254 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11255 SDValue SqrtVS =
11256 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11257
11258 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11259 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11260
11261 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11262 Flags);
11263
11264 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11265 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11266 Flags);
11267 } else {
11268 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11269
11270 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11271
11272 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11273 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11274 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11275
11276 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11277 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11278 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11279
11280 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11281 SDValue SqrtD =
11282 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11283 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11284 }
11285
11286 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11287
11288 SDValue ScaledDown =
11289 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11290
11291 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11292 SDValue IsZeroOrInf =
11293 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11294 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11295
11296 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11297}
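
The 1-ulp neighbour correction used in the denormal-aware path above, sketched with scalar floats; the helper name and the assumption of a finite, already-scaled input and estimate are illustrative.

#include <cmath>
#include <cstdint>
#include <cstring>

// Pick between an initial sqrt estimate S and its 1-ulp neighbours by testing
// the sign of the residual fma(-candidate, S, X), as done above.
static float refineSqrt(float X, float S) {
  uint32_t Bits;
  std::memcpy(&Bits, &S, sizeof(Bits));
  uint32_t DownBits = Bits - 1, UpBits = Bits + 1;
  float Down, Up;
  std::memcpy(&Down, &DownBits, sizeof(Down));
  std::memcpy(&Up, &UpBits, sizeof(Up));
  float VP = std::fmaf(-Down, S, X);
  float VS = std::fmaf(-Up, S, X);
  float Result = S;
  if (VP <= 0.0f)
    Result = Down;
  if (VS > 0.0f)
    Result = Up;
  return Result;
}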
11298
11299SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11300 // For double type, the SQRT and RSQ instructions don't have required
11301 // precision, we apply Goldschmidt's algorithm to improve the result:
11302 //
11303 // y0 = rsq(x)
11304 // g0 = x * y0
11305 // h0 = 0.5 * y0
11306 //
11307 // r0 = 0.5 - h0 * g0
11308 // g1 = g0 * r0 + g0
11309 // h1 = h0 * r0 + h0
11310 //
11311 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11312 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11313 // h2 = h1 * r1 + h1
11314 //
11315 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11316 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11317 //
11318 // sqrt(x) = g3
11319
11320 SDNodeFlags Flags = Op->getFlags();
11321
11322 SDLoc DL(Op);
11323
11324 SDValue X = Op.getOperand(0);
11325 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11326
11327 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11328
11329 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11330
11331 // Scale up input if it is too small.
11332 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11333 SDValue ScaleUp =
11334 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11335 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11336
11337 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11338
11339 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11340
11341 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11342 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11343
11344 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11345 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11346
11347 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11348
11349 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11350
11351 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11352 SDValue SqrtD0 =
11353 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11354
11355 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11356
11357 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11358 SDValue SqrtD1 =
11359 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11360
11361 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11362
11363 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11364 SDValue ScaleDown =
11365 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11366 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11367
11368 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11369 // with finite only or nsz because rsq(+/-0) = +/-inf
11370
11371 // TODO: Check for DAZ and expand to subnormals
11372 SDValue IsZeroOrInf =
11373 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11374 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11375
11376 // If x is +INF, +0, or -0, use its original value
11377 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11378 Flags);
11379}
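
A double-precision sketch of the Goldschmidt iteration documented at the top of this function, seeded with libm's 1/sqrt(x) instead of the hardware RSQ estimate; input scaling and the zero/infinity special cases are left out, and the helper name is illustrative.

#include <cmath>

// Goldschmidt iteration from the comment above.
static double goldschmidtSqrt(double X) {
  double Y0 = 1.0 / std::sqrt(X);
  double G = X * Y0;               // g0
  double H = 0.5 * Y0;             // h0
  double R = std::fma(-H, G, 0.5); // r0 = 0.5 - h0 * g0
  G = std::fma(G, R, G);           // g1
  H = std::fma(H, R, H);           // h1
  double D = std::fma(-G, G, X);   // d0 = x - g1 * g1
  G = std::fma(D, H, G);           // g2 = d0 * h1 + g1
  D = std::fma(-G, G, X);          // d1 = x - g2 * g2
  return std::fma(D, H, G);        // g3 = d1 * h1 + g2
}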
11380
11381SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11382 SDLoc DL(Op);
11383 EVT VT = Op.getValueType();
11384 SDValue Arg = Op.getOperand(0);
11385 SDValue TrigVal;
11386
11387 // Propagate fast-math flags so that the multiply we introduce can be folded
11388 // if Arg is already the result of a multiply by constant.
11389 auto Flags = Op->getFlags();
11390
11391 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11392
11393 if (Subtarget->hasTrigReducedRange()) {
11394 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11395 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11396 } else {
11397 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11398 }
11399
11400 switch (Op.getOpcode()) {
11401 case ISD::FCOS:
11402 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11403 case ISD::FSIN:
11404 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11405 default:
11406 llvm_unreachable("Wrong trig opcode");
11407 }
11408}
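
The range reduction above, sketched on the host: the constant is 1/(2*pi) and std::sin stands in for the hardware SIN, which consumes its argument in revolutions (helper name is illustrative).

#include <cmath>

// Scale the angle to revolutions and, when the hardware only accepts a
// reduced range, keep the fractional part before evaluating.
static float reducedSin(float X, bool HasTrigReducedRange) {
  const float OneOver2Pi = 0.15915494309189535f;
  float T = X * OneOver2Pi;
  if (HasTrigReducedRange)
    T = T - std::floor(T);                  // FRACT
  return std::sin(T * 6.2831853071795864f); // stands in for V_SIN_F32
}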
11409
11410SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11411 SelectionDAG &DAG) const {
11412 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11413 assert(AtomicNode->isCompareAndSwap());
11414 unsigned AS = AtomicNode->getAddressSpace();
11415
11416 // No custom lowering required for local address space
11417 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11418 return Op;
11419
11420 // Non-local address space requires custom lowering for atomic compare
11421 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11422 SDLoc DL(Op);
11423 SDValue ChainIn = Op.getOperand(0);
11424 SDValue Addr = Op.getOperand(1);
11425 SDValue Old = Op.getOperand(2);
11426 SDValue New = Op.getOperand(3);
11427 EVT VT = Op.getValueType();
11428 MVT SimpleVT = VT.getSimpleVT();
11429 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11430
11431 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11432 SDValue Ops[] = {ChainIn, Addr, NewOld};
11433
11434 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11435 Op->getVTList(), Ops, VT,
11436 AtomicNode->getMemOperand());
11437}
11438
11439//===----------------------------------------------------------------------===//
11440// Custom DAG optimizations
11441//===----------------------------------------------------------------------===//
11442
11443SDValue
11444SITargetLowering::performUCharToFloatCombine(SDNode *N,
11445 DAGCombinerInfo &DCI) const {
11446 EVT VT = N->getValueType(0);
11447 EVT ScalarVT = VT.getScalarType();
11448 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11449 return SDValue();
11450
11451 SelectionDAG &DAG = DCI.DAG;
11452 SDLoc DL(N);
11453
11454 SDValue Src = N->getOperand(0);
11455 EVT SrcVT = Src.getValueType();
11456
11457 // TODO: We could try to match extracting the higher bytes, which would be
11458 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11459 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11460 // about in practice.
11461 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11462 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11463 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11464 DCI.AddToWorklist(Cvt.getNode());
11465
11466 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11467 if (ScalarVT != MVT::f32) {
11468 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11469 DAG.getTargetConstant(0, DL, MVT::i32));
11470 }
11471 return Cvt;
11472 }
11473 }
11474
11475 return SDValue();
11476}
11477
11478SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11479 DAGCombinerInfo &DCI) const {
11480 SDValue MagnitudeOp = N->getOperand(0);
11481 SDValue SignOp = N->getOperand(1);
11482 SelectionDAG &DAG = DCI.DAG;
11483 SDLoc DL(N);
11484
11485 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11486 // lower half with a copy.
11487 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11488 if (MagnitudeOp.getValueType() == MVT::f64) {
11489 SDValue MagAsVector =
11490 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11491 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11492 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11493 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11494 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11495
11496 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11497
11498 SDValue Vector =
11499 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11500
11501 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11502 }
11503
11504 if (SignOp.getValueType() != MVT::f64)
11505 return SDValue();
11506
11507 // Reduce width of sign operand, we only need the highest bit.
11508 //
11509 // fcopysign f64:x, f64:y ->
11510 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11511 // TODO: In some cases it might make sense to go all the way to f16.
11512 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11513 SDValue SignAsF32 =
11514 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11515 DAG.getConstant(1, DL, MVT::i32));
11516
11517 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11518 SignAsF32);
11519}
11520
11521// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11522// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11523// bits
11524
11525// This is a variant of
11526// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11527//
11528// The normal DAG combiner will do this, but only if the add has one use since
11529// that would increase the number of instructions.
11530//
11531// This prevents us from seeing a constant offset that can be folded into a
11532// memory instruction's addressing mode. If we know the resulting add offset of
11533// a pointer can be folded into an addressing offset, we can replace the pointer
11534// operand with the add of new constant offset. This eliminates one of the uses,
11535// and may allow the remaining use to also be simplified.
11536//
11537SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11538 EVT MemVT,
11539 DAGCombinerInfo &DCI) const {
11540 SDValue N0 = N->getOperand(0);
11541 SDValue N1 = N->getOperand(1);
11542
11543 // We only do this to handle cases where it's profitable when there are
11544 // multiple uses of the add, so defer to the standard combine.
11545 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11546 N0->hasOneUse())
11547 return SDValue();
11548
11549 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11550 if (!CN1)
11551 return SDValue();
11552
11553 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11554 if (!CAdd)
11555 return SDValue();
11556
11557 SelectionDAG &DAG = DCI.DAG;
11558
11559 if (N0->getOpcode() == ISD::OR &&
11560 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11561 return SDValue();
11562
11563 // If the resulting offset is too large, we can't fold it into the
11564 // addressing mode offset.
11565 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11566 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11567
11568 AddrMode AM;
11569 AM.HasBaseReg = true;
11570 AM.BaseOffs = Offset.getSExtValue();
11571 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11572 return SDValue();
11573
11574 SDLoc SL(N);
11575 EVT VT = N->getValueType(0);
11576
11577 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11578 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11579
11580 SDNodeFlags Flags;
11581 Flags.setNoUnsignedWrap(
11582 N->getFlags().hasNoUnsignedWrap() &&
11583 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11584
11585 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11586}
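
A quick standalone check (values arbitrary) of the distributive identity the rewrite above relies on:

#include <cassert>
#include <cstdint>

int main() {
  // (x + c1) << c2 == (x << c2) + (c1 << c2) in wrap-around unsigned
  // arithmetic, which is what lets the shift be distributed over the add.
  uint64_t X = 0x123456789abcull, C1 = 64, C2 = 3;
  assert(((X + C1) << C2) == ((X << C2) + (C1 << C2)));
  return 0;
}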
11587
11588 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11589 /// index must be offset past the chain and the intrinsic ID. Theoretically we
11590 /// would also need to check the specific intrinsic, but they all place the pointer operand first.
11591static unsigned getBasePtrIndex(const MemSDNode *N) {
11592 switch (N->getOpcode()) {
11593 case ISD::STORE:
11594 case ISD::INTRINSIC_W_CHAIN:
11595 case ISD::INTRINSIC_VOID:
11596 return 2;
11597 default:
11598 return 1;
11599 }
11600}
11601
11602SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11603 DAGCombinerInfo &DCI) const {
11604 SelectionDAG &DAG = DCI.DAG;
11605 SDLoc SL(N);
11606
11607 unsigned PtrIdx = getBasePtrIndex(N);
11608 SDValue Ptr = N->getOperand(PtrIdx);
11609
11610 // TODO: We could also do this for multiplies.
11611 if (Ptr.getOpcode() == ISD::SHL) {
11612 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11613 N->getMemoryVT(), DCI);
11614 if (NewPtr) {
11615 SmallVector<SDValue, 8> NewOps(N->ops());
11616
11617 NewOps[PtrIdx] = NewPtr;
11618 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11619 }
11620 }
11621
11622 return SDValue();
11623}
11624
11625static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11626 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11627 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11628 (Opc == ISD::XOR && Val == 0);
11629}
11630
11631// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11632// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11633// integer combine opportunities since most 64-bit operations are decomposed
11634// this way. TODO: We won't want this for SALU especially if it is an inline
11635// immediate.
11636SDValue SITargetLowering::splitBinaryBitConstantOp(
11637 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11638 const ConstantSDNode *CRHS) const {
11639 uint64_t Val = CRHS->getZExtValue();
11640 uint32_t ValLo = Lo_32(Val);
11641 uint32_t ValHi = Hi_32(Val);
11643
11644 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11645 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11646 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11647 // If we need to materialize a 64-bit immediate, it will be split up later
11648 // anyway. Avoid creating the harder to understand 64-bit immediate
11649 // materialization.
11650 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11651 }
11652
11653 return SDValue();
11654}
11655
11656 bool llvm::isBoolSGPR(SDValue V) {
11657 if (V.getValueType() != MVT::i1)
11658 return false;
11659 switch (V.getOpcode()) {
11660 default:
11661 break;
11662 case ISD::SETCC:
11664 return true;
11665 case ISD::AND:
11666 case ISD::OR:
11667 case ISD::XOR:
11668 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11669 }
11670 return false;
11671}
11672
11673// If a constant has all zeroes or all ones within each byte return it.
11674// Otherwise return 0.
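// e.g. 0x00ff00ff is returned unchanged, while 0x00ff000f returns 0 because
// its low byte is only partially set.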
11675 static uint32_t getConstantPermuteMask(uint32_t C) {
11676 // 0xff for any zero byte in the mask
11677 uint32_t ZeroByteMask = 0;
11678 if (!(C & 0x000000ff))
11679 ZeroByteMask |= 0x000000ff;
11680 if (!(C & 0x0000ff00))
11681 ZeroByteMask |= 0x0000ff00;
11682 if (!(C & 0x00ff0000))
11683 ZeroByteMask |= 0x00ff0000;
11684 if (!(C & 0xff000000))
11685 ZeroByteMask |= 0xff000000;
11686 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11687 if ((NonZeroByteMask & C) != NonZeroByteMask)
11688 return 0; // Partial bytes selected.
11689 return C;
11690}
11691
11692// Check if a node selects whole bytes from its operand 0 starting at a byte
11693 // boundary while masking the rest. Returns the select mask as used by
11694 // v_perm_b32, or ~0 if it does not succeed.
11695// Note byte select encoding:
11696// value 0-3 selects corresponding source byte;
11697// value 0xc selects zero;
11698// value 0xff selects 0xff.
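// e.g. (and x, 0x0000ffff) yields mask 0x0c0c0100, (shl x, 16) yields
// 0x01000c0c, and (srl x, 8) yields 0x0c030201.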
11699 static uint32_t getPermuteMask(SDValue V) {
11700 assert(V.getValueSizeInBits() == 32);
11701
11702 if (V.getNumOperands() != 2)
11703 return ~0;
11704
11705 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11706 if (!N1)
11707 return ~0;
11708
11709 uint32_t C = N1->getZExtValue();
11710
11711 switch (V.getOpcode()) {
11712 default:
11713 break;
11714 case ISD::AND:
11715 if (uint32_t ConstMask = getConstantPermuteMask(C))
11716 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11717 break;
11718
11719 case ISD::OR:
11720 if (uint32_t ConstMask = getConstantPermuteMask(C))
11721 return (0x03020100 & ~ConstMask) | ConstMask;
11722 break;
11723
11724 case ISD::SHL:
11725 if (C % 8)
11726 return ~0;
11727
11728 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11729
11730 case ISD::SRL:
11731 if (C % 8)
11732 return ~0;
11733
11734 return uint32_t(0x0c0c0c0c03020100ull >> C);
11735 }
11736
11737 return ~0;
11738}
11739
11740SDValue SITargetLowering::performAndCombine(SDNode *N,
11741 DAGCombinerInfo &DCI) const {
11742 if (DCI.isBeforeLegalize())
11743 return SDValue();
11744
11745 SelectionDAG &DAG = DCI.DAG;
11746 EVT VT = N->getValueType(0);
11747 SDValue LHS = N->getOperand(0);
11748 SDValue RHS = N->getOperand(1);
11749
11750 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11751 if (VT == MVT::i64 && CRHS) {
11752 if (SDValue Split =
11753 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11754 return Split;
11755 }
11756
11757 if (CRHS && VT == MVT::i32) {
11758 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11759 // nb = number of trailing zeroes in mask
11760 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11761 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
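// e.g. (and (srl x, 5), 0x7f8) becomes (shl (bfe x, 8, 8), 3): the mask has
// nb = 3 trailing zeroes, so the field starts at bit 3 + 5 = 8 and is 8 bits wide.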
11762 uint64_t Mask = CRHS->getZExtValue();
11763 unsigned Bits = llvm::popcount(Mask);
11764 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11765 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11766 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11767 unsigned Shift = CShift->getZExtValue();
11768 unsigned NB = CRHS->getAPIntValue().countr_zero();
11769 unsigned Offset = NB + Shift;
11770 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11771 SDLoc SL(N);
11772 SDValue BFE =
11773 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11774 DAG.getConstant(Offset, SL, MVT::i32),
11775 DAG.getConstant(Bits, SL, MVT::i32));
11776 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11777 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11778 DAG.getValueType(NarrowVT));
11779 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11780 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11781 return Shl;
11782 }
11783 }
11784 }
11785
11786 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11787 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11788 isa<ConstantSDNode>(LHS.getOperand(2))) {
11789 uint32_t Sel = getConstantPermuteMask(Mask);
11790 if (!Sel)
11791 return SDValue();
11792
11793 // Select 0xc for all zero bytes
11794 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11795 SDLoc DL(N);
11796 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11797 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11798 }
11799 }
11800
11801 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11802 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11803 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11804 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11805 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11806
11807 SDValue X = LHS.getOperand(0);
11808 SDValue Y = RHS.getOperand(0);
11809 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11810 !isTypeLegal(X.getValueType()))
11811 return SDValue();
11812
11813 if (LCC == ISD::SETO) {
11814 if (X != LHS.getOperand(1))
11815 return SDValue();
11816
11817 if (RCC == ISD::SETUNE) {
11818 const ConstantFPSDNode *C1 =
11819 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11820 if (!C1 || !C1->isInfinity() || C1->isNegative())
11821 return SDValue();
11822 
11823 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11824 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11825 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11826 SIInstrFlags::P_NORMAL;
11827 
11828 static_assert(
11829 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11830 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11831 0x3ff) == Mask,
11832 "mask not equal");
11833
11834 SDLoc DL(N);
11835 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11836 DAG.getConstant(Mask, DL, MVT::i32));
11837 }
11838 }
11839 }
11840
11841 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11842 std::swap(LHS, RHS);
11843
11844 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11845 RHS.hasOneUse()) {
11846 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11847 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11848 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11849 // | n_nan)
11850 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11851 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11852 (RHS.getOperand(0) == LHS.getOperand(0) &&
11853 LHS.getOperand(0) == LHS.getOperand(1))) {
11854 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11855 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11856 : Mask->getZExtValue() & OrdMask;
11857
11858 SDLoc DL(N);
11859 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11860 DAG.getConstant(NewMask, DL, MVT::i32));
11861 }
11862 }
11863
11864 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11865 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11866 // and x, (sext cc from i1) => select cc, x, 0
11867 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11868 std::swap(LHS, RHS);
11869 if (isBoolSGPR(RHS.getOperand(0)))
11870 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11871 DAG.getConstant(0, SDLoc(N), MVT::i32));
11872 }
11873
11874 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
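// e.g. (and (or x, 0xff000000), (or y, 0x00ffffff)) -> (perm y, x, 0x07020100),
// i.e. byte 3 from y and bytes 2..0 from x.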
11876 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11877 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11878 uint32_t LHSMask = getPermuteMask(LHS);
11879 uint32_t RHSMask = getPermuteMask(RHS);
11880 if (LHSMask != ~0u && RHSMask != ~0u) {
11881 // Canonicalize the expression in an attempt to have fewer unique masks
11882 // and therefore fewer registers used to hold the masks.
11883 if (LHSMask > RHSMask) {
11884 std::swap(LHSMask, RHSMask);
11885 std::swap(LHS, RHS);
11886 }
11887
11888 // Select 0xc for each lane used from a source operand. Zero bytes have the 0xc
11889 // mask set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
11890 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11891 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11892
11893 // Check if we need to combine values from two sources within a byte.
11894 if (!(LHSUsedLanes & RHSUsedLanes) &&
11895 // If we select high and lower word keep it for SDWA.
11896 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11897 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11898 // Each byte in each mask is either a selector value 0-3, or has higher
11899 // bits set: 0xff selects the constant 0xff and 0x0c selects zero. If 0x0c
11900 // appears in either mask it must stay 0x0c. Otherwise the mask that is not
11901 // 0xff wins. ANDing both masks gives a correct result, except that any byte
11902 // selecting zero must be corrected back to exactly 0x0c.
11903 uint32_t Mask = LHSMask & RHSMask;
11904 for (unsigned I = 0; I < 32; I += 8) {
11905 uint32_t ByteSel = 0xff << I;
11906 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11907 Mask &= (0x0c << I) & 0xffffffff;
11908 }
11909
11910 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11911 // or 0x0c.
11912 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11913 SDLoc DL(N);
11914
11915 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11916 RHS.getOperand(0),
11917 DAG.getConstant(Sel, DL, MVT::i32));
11918 }
11919 }
11920 }
11921
11922 return SDValue();
11923}
11924
11925 // A key component of v_perm is a mapping between the byte positions of the
11926 // src operands and the byte positions of the dest. To provide such a mapping,
11927 // we need: 1. the node that provides byte x of the dest of the OR, and 2. the
11928 // byte of that node used to provide byte x. calculateByteProvider finds which
11929 // node provides a certain byte of the dest of the OR, and calculateSrcByte
11930 // takes that node and finds the ultimate src and byte position. For example,
11931 // the supported LoadCombine pattern for vector loads is as follows:
11932// t1
11933// or
11934// / \
11935// t2 t3
11936// zext shl
11937// | | \
11938// t4 t5 16
11939// or anyext
11940// / \ |
11941// t6 t7 t8
11942// srl shl or
11943// / | / \ / \
11944// t9 t10 t11 t12 t13 t14
11945// trunc* 8 trunc* 8 and and
11946// | | / | | \
11947// t15 t16 t17 t18 t19 t20
11948// trunc* 255 srl -256
11949// | / \
11950// t15 t15 16
11951//
11952// *In this example, the truncs are from i32->i16
11953//
11954// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11955// respectively. calculateSrcByte would find (given node) -> ultimate src &
11956// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11957// After finding the mapping, we can combine the tree into vperm t15, t16,
11958// 0x05000407
11959
11960// Find the source and byte position from a node.
11961 // \p DestByte is the byte position of the dest of the or that the src
11962 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
11963 // byte of the dest of the or. \p Depth tracks how many recursive iterations we
11964 // have performed.
11965static const std::optional<ByteProvider<SDValue>>
11966calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11967 unsigned Depth = 0) {
11968 // We may need to recursively traverse a series of SRLs
11969 if (Depth >= 6)
11970 return std::nullopt;
11971
11972 if (Op.getValueSizeInBits() < 8)
11973 return std::nullopt;
11974
11975 if (Op.getValueType().isVector())
11976 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11977
11978 switch (Op->getOpcode()) {
11979 case ISD::TRUNCATE: {
11980 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11981 }
11982
11983 case ISD::SIGN_EXTEND:
11984 case ISD::ZERO_EXTEND:
11985 case ISD::SIGN_EXTEND_INREG: {
11986 SDValue NarrowOp = Op->getOperand(0);
11987 auto NarrowVT = NarrowOp.getValueType();
11988 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11989 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11990 NarrowVT = VTSign->getVT();
11991 }
11992 if (!NarrowVT.isByteSized())
11993 return std::nullopt;
11994 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11995
11996 if (SrcIndex >= NarrowByteWidth)
11997 return std::nullopt;
11998 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11999 }
12000
12001 case ISD::SRA:
12002 case ISD::SRL: {
12003 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12004 if (!ShiftOp)
12005 return std::nullopt;
12006
12007 uint64_t BitShift = ShiftOp->getZExtValue();
12008
12009 if (BitShift % 8 != 0)
12010 return std::nullopt;
12011
12012 SrcIndex += BitShift / 8;
12013
12014 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12015 }
12016
12017 default: {
12018 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12019 }
12020 }
12021 llvm_unreachable("fully handled switch");
12022}
12023
12024 // For a byte position in the result of an Or, traverse the tree and find the
12025 // node (and the byte of the node) which ultimately provides this {Or,
12026 // BytePosition}. \p Op is the operand we are currently examining. \p Index is
12027 // the byte position of the Op that corresponds with the originally requested
12028 // byte of the Or. \p Depth tracks how many recursive iterations we have
12029 // performed. \p StartingIndex is the originally requested byte of the Or.
12030static const std::optional<ByteProvider<SDValue>>
12031calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12032 unsigned StartingIndex = 0) {
12033 // Finding Src tree of RHS of or typically requires at least 1 additional
12034 // depth
12035 if (Depth > 6)
12036 return std::nullopt;
12037
12038 unsigned BitWidth = Op.getScalarValueSizeInBits();
12039 if (BitWidth % 8 != 0)
12040 return std::nullopt;
12041 if (Index > BitWidth / 8 - 1)
12042 return std::nullopt;
12043
12044 bool IsVec = Op.getValueType().isVector();
12045 switch (Op.getOpcode()) {
12046 case ISD::OR: {
12047 if (IsVec)
12048 return std::nullopt;
12049
12050 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12051 StartingIndex);
12052 if (!RHS)
12053 return std::nullopt;
12054 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12055 StartingIndex);
12056 if (!LHS)
12057 return std::nullopt;
12058 // A well formed Or will have two ByteProviders for each byte, one of which
12059 // is constant zero
12060 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12061 return std::nullopt;
12062 if (!LHS || LHS->isConstantZero())
12063 return RHS;
12064 if (!RHS || RHS->isConstantZero())
12065 return LHS;
12066 return std::nullopt;
12067 }
12068
12069 case ISD::AND: {
12070 if (IsVec)
12071 return std::nullopt;
12072
12073 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12074 if (!BitMaskOp)
12075 return std::nullopt;
12076
12077 uint32_t BitMask = BitMaskOp->getZExtValue();
12078 // Bits we expect for our Index
12079 uint32_t IndexMask = 0xFF << (Index * 8);
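// e.g. for Index 1 IndexMask is 0xFF00: an AND mask of 0xFF00 fully provides
// the byte, 0x00FF makes it constant zero, and 0x1F00 only partially provides
// it, so we give up.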
12080
12081 if ((IndexMask & BitMask) != IndexMask) {
12082 // If the result of the and partially provides the byte, then it
12083 // is not well formatted
12084 if (IndexMask & BitMask)
12085 return std::nullopt;
12086 return ByteProvider<SDValue>::getConstantZero();
12087 }
12088
12089 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12090 }
12091
12092 case ISD::FSHR: {
12093 if (IsVec)
12094 return std::nullopt;
12095
12096 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12097 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12098 if (!ShiftOp || Op.getValueType().isVector())
12099 return std::nullopt;
12100
12101 uint64_t BitsProvided = Op.getValueSizeInBits();
12102 if (BitsProvided % 8 != 0)
12103 return std::nullopt;
12104
12105 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12106 if (BitShift % 8)
12107 return std::nullopt;
12108
12109 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12110 uint64_t ByteShift = BitShift / 8;
12111
12112 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12113 uint64_t BytesProvided = BitsProvided / 8;
12114 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12115 NewIndex %= BytesProvided;
12116 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12117 }
12118
12119 case ISD::SRA:
12120 case ISD::SRL: {
12121 if (IsVec)
12122 return std::nullopt;
12123
12124 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12125 if (!ShiftOp)
12126 return std::nullopt;
12127
12128 uint64_t BitShift = ShiftOp->getZExtValue();
12129 if (BitShift % 8)
12130 return std::nullopt;
12131
12132 auto BitsProvided = Op.getScalarValueSizeInBits();
12133 if (BitsProvided % 8 != 0)
12134 return std::nullopt;
12135
12136 uint64_t BytesProvided = BitsProvided / 8;
12137 uint64_t ByteShift = BitShift / 8;
12138 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12139 // If the byte we are trying to provide (as tracked by index) falls in this
12140 // range, then the SRL provides the byte. The byte of interest of the src of
12141 // the SRL is Index + ByteShift
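// e.g. for (srl i32:x, 8), bytes 0-2 of the result come from bytes 1-3 of x,
// while byte 3 is not handled here and returns std::nullopt.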
12142 return BytesProvided - ByteShift > Index
12143 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12144 Index + ByteShift)
12145 : std::nullopt;
12146 }
12147
12148 case ISD::SHL: {
12149 if (IsVec)
12150 return std::nullopt;
12151
12152 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12153 if (!ShiftOp)
12154 return std::nullopt;
12155
12156 uint64_t BitShift = ShiftOp->getZExtValue();
12157 if (BitShift % 8 != 0)
12158 return std::nullopt;
12159 uint64_t ByteShift = BitShift / 8;
12160
12161 // If we are shifting by an amount greater than (or equal to)
12162 // the index we are trying to provide, then it provides 0s. If not,
12163 // then the byte is not definitively 0, and the corresponding byte
12164 // of interest is Index - ByteShift of the src.
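// e.g. for (shl i32:x, 16), bytes 0-1 of the result are constant zero and
// bytes 2-3 come from bytes 0-1 of x.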
12165 return Index < ByteShift
12166 ? ByteProvider<SDValue>::getConstantZero()
12167 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12168 Depth + 1, StartingIndex);
12169 }
12170 case ISD::ANY_EXTEND:
12171 case ISD::SIGN_EXTEND:
12172 case ISD::ZERO_EXTEND:
12173 case ISD::SIGN_EXTEND_INREG:
12174 case ISD::AssertZext:
12175 case ISD::AssertSext: {
12176 if (IsVec)
12177 return std::nullopt;
12178
12179 SDValue NarrowOp = Op->getOperand(0);
12180 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12181 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12182 Op->getOpcode() == ISD::AssertZext ||
12183 Op->getOpcode() == ISD::AssertSext) {
12184 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12185 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12186 }
12187 if (NarrowBitWidth % 8 != 0)
12188 return std::nullopt;
12189 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12190
12191 if (Index >= NarrowByteWidth)
12192 return Op.getOpcode() == ISD::ZERO_EXTEND
12193 ? std::optional<ByteProvider<SDValue>>(
12194 ByteProvider<SDValue>::getConstantZero())
12195 : std::nullopt;
12196 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12197 }
12198
12199 case ISD::TRUNCATE: {
12200 if (IsVec)
12201 return std::nullopt;
12202
12203 uint64_t NarrowByteWidth = BitWidth / 8;
12204
12205 if (NarrowByteWidth >= Index) {
12206 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12207 StartingIndex);
12208 }
12209
12210 return std::nullopt;
12211 }
12212
12213 case ISD::CopyFromReg: {
12214 if (BitWidth / 8 > Index)
12215 return calculateSrcByte(Op, StartingIndex, Index);
12216
12217 return std::nullopt;
12218 }
12219
12220 case ISD::LOAD: {
12221 auto *L = cast<LoadSDNode>(Op.getNode());
12222
12223 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12224 if (NarrowBitWidth % 8 != 0)
12225 return std::nullopt;
12226 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12227
12228 // If the width of the load does not reach the byte we are trying to provide
12229 // and it is not a ZEXTLOAD, then the load does not provide the byte in
12230 // question.
12231 if (Index >= NarrowByteWidth) {
12232 return L->getExtensionType() == ISD::ZEXTLOAD
12233 ? std::optional<ByteProvider<SDValue>>(
12234 ByteProvider<SDValue>::getConstantZero())
12235 : std::nullopt;
12236 }
12237
12238 if (NarrowByteWidth > Index) {
12239 return calculateSrcByte(Op, StartingIndex, Index);
12240 }
12241
12242 return std::nullopt;
12243 }
12244
12245 case ISD::BSWAP: {
12246 if (IsVec)
12247 return std::nullopt;
12248
12249 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12250 Depth + 1, StartingIndex);
12251 }
12252
12253 case ISD::EXTRACT_VECTOR_ELT: {
12254 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12255 if (!IdxOp)
12256 return std::nullopt;
12257 auto VecIdx = IdxOp->getZExtValue();
12258 auto ScalarSize = Op.getScalarValueSizeInBits();
12259 if (ScalarSize < 32)
12260 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12261 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12262 StartingIndex, Index);
12263 }
12264
12265 case AMDGPUISD::PERM: {
12266 if (IsVec)
12267 return std::nullopt;
12268
12269 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12270 if (!PermMask)
12271 return std::nullopt;
12272
12273 auto IdxMask =
12274 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12275 if (IdxMask > 0x07 && IdxMask != 0x0c)
12276 return std::nullopt;
12277
12278 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12279 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12280
12281 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12282 : ByteProvider<SDValue>(
12283 ByteProvider<SDValue>::getConstantZero());
12284 }
12285
12286 default: {
12287 return std::nullopt;
12288 }
12289 }
12290
12291 llvm_unreachable("fully handled switch");
12292}
12293
12294 // Returns true if the Operand is a scalar value that was extended from 16 bits
12295static bool isExtendedFrom16Bits(SDValue &Operand) {
12296
12297 switch (Operand.getOpcode()) {
12298 case ISD::ANY_EXTEND:
12299 case ISD::SIGN_EXTEND:
12300 case ISD::ZERO_EXTEND: {
12301 auto OpVT = Operand.getOperand(0).getValueType();
12302 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12303 }
12304 case ISD::LOAD: {
12305 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12306 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12307 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12308 ExtType == ISD::EXTLOAD) {
12309 auto MemVT = L->getMemoryVT();
12310 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12311 }
12312 return L->getMemoryVT().getSizeInBits() == 16;
12313 }
12314 default:
12315 return false;
12316 }
12317}
12318
12319// Returns true if the mask matches consecutive bytes, and the first byte
12320// begins at a power of 2 byte offset from 0th byte
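// e.g. mask 0x0504 selects two consecutive bytes starting at an even offset
// within one source, so it maps cleanly to a 16-bit operand; 0x0201 is
// consecutive but starts at byte 1, so it does not.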
12321static bool addresses16Bits(int Mask) {
12322 int Low8 = Mask & 0xff;
12323 int Hi8 = (Mask & 0xff00) >> 8;
12324
12325 assert(Low8 < 8 && Hi8 < 8);
12326 // Are the bytes contiguous in the order of increasing addresses.
12327 bool IsConsecutive = (Hi8 - Low8 == 1);
12328 // Is the first byte at a location that is aligned for 16 bit instructions?
12329 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
12330 // In this case, we still need code to extract the 16 bit operand, so it
12331 // is better to use i8 v_perm
12332 bool Is16Aligned = !(Low8 % 2);
12333
12334 return IsConsecutive && Is16Aligned;
12335}
12336
12337// Do not lower into v_perm if the operands are actually 16 bit
12338// and the selected bits (based on PermMask) correspond with two
12339// easily addressable 16 bit operands.
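// e.g. PermMask 0x05040100 selects the low half of each operand, which is
// better served by 16-bit ops / SDWA than by v_perm.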
12340 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12341 SDValue &OtherOp) {
12342 int Low16 = PermMask & 0xffff;
12343 int Hi16 = (PermMask & 0xffff0000) >> 16;
12344
12345 auto TempOp = peekThroughBitcasts(Op);
12346 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12347
12348 auto OpIs16Bit =
12349 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12350 if (!OpIs16Bit)
12351 return true;
12352
12353 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12354 isExtendedFrom16Bits(TempOtherOp);
12355 if (!OtherOpIs16Bit)
12356 return true;
12357
12358 // Do we cleanly address both
12359 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12360}
12361
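// Extract the 32-bit dword at dword index \p DWordOffset of \p Src and return
// it as an i32 (e.g. DWordOffset 1 of a 64-bit scalar yields bits [63:32]).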
12362 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12363 unsigned DWordOffset) {
12364 SDValue Ret;
12365
12366 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12367 // ByteProvider must be at least 8 bits
12368 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12369
12370 if (TypeSize <= 32)
12371 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12372
12373 if (Src.getValueType().isVector()) {
12374 auto ScalarTySize = Src.getScalarValueSizeInBits();
12375 auto ScalarTy = Src.getValueType().getScalarType();
12376 if (ScalarTySize == 32) {
12377 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12378 DAG.getConstant(DWordOffset, SL, MVT::i32));
12379 }
12380 if (ScalarTySize > 32) {
12381 Ret = DAG.getNode(
12382 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12383 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12384 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12385 if (ShiftVal)
12386 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12387 DAG.getConstant(ShiftVal, SL, MVT::i32));
12388 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12389 }
12390
12391 assert(ScalarTySize < 32);
12392 auto NumElements = TypeSize / ScalarTySize;
12393 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12394 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12395 auto NumElementsIn32 = 32 / ScalarTySize;
12396 auto NumAvailElements = DWordOffset < Trunc32Elements
12397 ? NumElementsIn32
12398 : NumElements - NormalizedTrunc;
12399
12400 SmallVector<SDValue, 4> VecSrcs;
12401 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12402 NumAvailElements);
12403
12404 Ret = DAG.getBuildVector(
12405 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12406 VecSrcs);
12407 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12408 }
12409
12410 /// Scalar Type
12411 auto ShiftVal = 32 * DWordOffset;
12412 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12413 DAG.getConstant(ShiftVal, SL, MVT::i32));
12414 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12415}
12416
12417 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12418 SelectionDAG &DAG = DCI.DAG;
12419 [[maybe_unused]] EVT VT = N->getValueType(0);
12420 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12421 
12422 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12423 assert(VT == MVT::i32);
12424 for (int i = 0; i < 4; i++) {
12425 // Find the ByteProvider that provides the ith byte of the result of OR
12426 std::optional<ByteProvider<SDValue>> P =
12427 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12428 // TODO support constantZero
12429 if (!P || P->isConstantZero())
12430 return SDValue();
12431
12432 PermNodes.push_back(*P);
12433 }
12434 if (PermNodes.size() != 4)
12435 return SDValue();
12436
12437 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12438 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12439 uint64_t PermMask = 0x00000000;
12440 for (size_t i = 0; i < PermNodes.size(); i++) {
12441 auto PermOp = PermNodes[i];
12442 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12443 // by sizeof(Src2) = 4
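// e.g. a byte taken from offset 2 of the first source gets selector 2 + 4 = 6,
// while offset 2 of the second source gets selector 2.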
12444 int SrcByteAdjust = 4;
12445
12446 // If the Src uses a byte from a different DWORD, then it corresponds
12447 // with a different source.
12448 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12449 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12450 if (SecondSrc)
12451 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12452 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12453 return SDValue();
12454
12455 // Set the index of the second distinct Src node
12456 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12457 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12458 SrcByteAdjust = 0;
12459 }
12460 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12462 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12463 }
12464 SDLoc DL(N);
12465 SDValue Op = *PermNodes[FirstSrc.first].Src;
12466 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12467 assert(Op.getValueSizeInBits() == 32);
12468
12469 // Check that we are not just extracting the bytes in order from an op
12470 if (!SecondSrc) {
12471 int Low16 = PermMask & 0xffff;
12472 int Hi16 = (PermMask & 0xffff0000) >> 16;
12473
12474 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12475 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
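// e.g. PermMask 0x03020100 or 0x07060504 takes the four bytes in order from
// the single source dword, so the or already equals Op.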
12476
12477 // The perm op would really just produce Op. So combine into Op
12478 if (WellFormedLow && WellFormedHi)
12479 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12480 }
12481
12482 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12483
12484 if (SecondSrc) {
12485 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12486 assert(OtherOp.getValueSizeInBits() == 32);
12487 }
12488
12489 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12490
12491 assert(Op.getValueType().isByteSized() &&
12492 OtherOp.getValueType().isByteSized());
12493
12494 // If the ultimate src is less than 32 bits, then we will only be
12495 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12496 // CalculateByteProvider would not have returned Op as source if we
12497 // used a byte that is outside its ValueType. Thus, we are free to
12498 // ANY_EXTEND as the extended bits are dont-cares.
12499 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12500 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12501
12502 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12503 DAG.getConstant(PermMask, DL, MVT::i32));
12504 }
12505 return SDValue();
12506}
12507
12508SDValue SITargetLowering::performOrCombine(SDNode *N,
12509 DAGCombinerInfo &DCI) const {
12510 SelectionDAG &DAG = DCI.DAG;
12511 SDValue LHS = N->getOperand(0);
12512 SDValue RHS = N->getOperand(1);
12513
12514 EVT VT = N->getValueType(0);
12515 if (VT == MVT::i1) {
12516 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12517 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12518 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12519 SDValue Src = LHS.getOperand(0);
12520 if (Src != RHS.getOperand(0))
12521 return SDValue();
12522
12523 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12524 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12525 if (!CLHS || !CRHS)
12526 return SDValue();
12527
12528 // Only 10 bits are used.
12529 static const uint32_t MaxMask = 0x3ff;
12530
12531 uint32_t NewMask =
12532 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12533 SDLoc DL(N);
12534 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12535 DAG.getConstant(NewMask, DL, MVT::i32));
12536 }
12537
12538 return SDValue();
12539 }
12540
12541 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12542 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12543 LHS.getOpcode() == AMDGPUISD::PERM &&
12544 isa<ConstantSDNode>(LHS.getOperand(2))) {
12545 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12546 if (!Sel)
12547 return SDValue();
12548
12549 Sel |= LHS.getConstantOperandVal(2);
12550 SDLoc DL(N);
12551 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12552 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12553 }
12554
12555 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
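// e.g. (or (and x, 0xff), (and y, 0xff00)) -> (perm y, x, 0x0c0c0500), i.e.
// byte 0 from x, byte 1 from y, and the two high bytes are zero.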
12557 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12558 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12559
12560 // If all the uses of an or need to extract the individual elements, do not
12561 // attempt to lower into v_perm
12562 auto usesCombinedOperand = [](SDNode *OrUse) {
12563 // If we have any non-vectorized use, then it is a candidate for v_perm
12564 if (OrUse->getOpcode() != ISD::BITCAST ||
12565 !OrUse->getValueType(0).isVector())
12566 return true;
12567
12568 // If any use of the bitcast is non-vectorized, then the or is a candidate for v_perm
12569 for (auto *VUser : OrUse->users()) {
12570 if (!VUser->getValueType(0).isVector())
12571 return true;
12572
12573 // If the use of a vector is a store, then combining via a v_perm
12574 // is beneficial.
12575 // TODO -- whitelist more uses
12576 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12577 if (VUser->getOpcode() == VectorwiseOp)
12578 return true;
12579 }
12580 return false;
12581 };
12582
12583 if (!any_of(N->users(), usesCombinedOperand))
12584 return SDValue();
12585
12586 uint32_t LHSMask = getPermuteMask(LHS);
12587 uint32_t RHSMask = getPermuteMask(RHS);
12588
12589 if (LHSMask != ~0u && RHSMask != ~0u) {
12590 // Canonicalize the expression in an attempt to have fewer unique masks
12591 // and therefore fewer registers used to hold the masks.
12592 if (LHSMask > RHSMask) {
12593 std::swap(LHSMask, RHSMask);
12594 std::swap(LHS, RHS);
12595 }
12596
12597 // Select 0xc for each lane used from a source operand. Zero bytes have the 0xc
12598 // mask set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
12599 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12600 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12601
12602 // Check if we need to combine values from two sources within a byte.
12603 if (!(LHSUsedLanes & RHSUsedLanes) &&
12604 // If we select high and lower word keep it for SDWA.
12605 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12606 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12607 // Kill zero bytes selected by other mask. Zero value is 0xc.
12608 LHSMask &= ~RHSUsedLanes;
12609 RHSMask &= ~LHSUsedLanes;
12610 // Add 4 to each active LHS lane
12611 LHSMask |= LHSUsedLanes & 0x04040404;
12612 // Combine masks
12613 uint32_t Sel = LHSMask | RHSMask;
12614 SDLoc DL(N);
12615
12616 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12617 RHS.getOperand(0),
12618 DAG.getConstant(Sel, DL, MVT::i32));
12619 }
12620 }
12621 if (LHSMask == ~0u || RHSMask == ~0u) {
12622 if (SDValue Perm = matchPERM(N, DCI))
12623 return Perm;
12624 }
12625 }
12626
12627 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12628 return SDValue();
12629
12630 // TODO: This could be a generic combine with a predicate for extracting the
12631 // high half of an integer being free.
12632
12633 // (or i64:x, (zero_extend i32:y)) ->
12634 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12635 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12636 RHS.getOpcode() != ISD::ZERO_EXTEND)
12637 std::swap(LHS, RHS);
12638
12639 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12640 SDValue ExtSrc = RHS.getOperand(0);
12641 EVT SrcVT = ExtSrc.getValueType();
12642 if (SrcVT == MVT::i32) {
12643 SDLoc SL(N);
12644 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12645 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12646
12647 DCI.AddToWorklist(LowOr.getNode());
12648 DCI.AddToWorklist(HiBits.getNode());
12649
12650 SDValue Vec =
12651 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12652 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12653 }
12654 }
12655
12656 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12657 if (CRHS) {
12658 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12659 N->getOperand(0), CRHS))
12660 return Split;
12661 }
12662
12663 return SDValue();
12664}
12665
12666SDValue SITargetLowering::performXorCombine(SDNode *N,
12667 DAGCombinerInfo &DCI) const {
12668 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12669 return RV;
12670
12671 SDValue LHS = N->getOperand(0);
12672 SDValue RHS = N->getOperand(1);
12673
12674 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12675 SelectionDAG &DAG = DCI.DAG;
12676
12677 EVT VT = N->getValueType(0);
12678 if (CRHS && VT == MVT::i64) {
12679 if (SDValue Split =
12680 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12681 return Split;
12682 }
12683
12684 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12685 // fneg-like xors into 64-bit select.
12686 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12687 // This looks like an fneg, try to fold as a source modifier.
12688 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12689 shouldFoldFNegIntoSrc(N, LHS)) {
12690 // xor (select c, a, b), 0x80000000 ->
12691 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12692 SDLoc DL(N);
12693 SDValue CastLHS =
12694 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12695 SDValue CastRHS =
12696 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12697 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12698 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12699 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12700 LHS->getOperand(0), FNegLHS, FNegRHS);
12701 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12702 }
12703 }
12704
12705 return SDValue();
12706}
12707
12708SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12709 DAGCombinerInfo &DCI) const {
12710 if (!Subtarget->has16BitInsts() ||
12711 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12712 return SDValue();
12713
12714 EVT VT = N->getValueType(0);
12715 if (VT != MVT::i32)
12716 return SDValue();
12717
12718 SDValue Src = N->getOperand(0);
12719 if (Src.getValueType() != MVT::i16)
12720 return SDValue();
12721
12722 return SDValue();
12723}
12724
12725SDValue
12726SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12727 DAGCombinerInfo &DCI) const {
12728 SDValue Src = N->getOperand(0);
12729 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12730
12731 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12732 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12733 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12734 VTSign->getVT() == MVT::i8) ||
12735 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12736 VTSign->getVT() == MVT::i16))) {
12737 assert(Subtarget->hasScalarSubwordLoads() &&
12738 "s_buffer_load_{u8, i8} are supported "
12739 "in GFX12 (or newer) architectures.");
12740 EVT VT = Src.getValueType();
12741 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12742 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12743 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12744 SDLoc DL(N);
12745 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12746 SDValue Ops[] = {
12747 Src.getOperand(0), // source register
12748 Src.getOperand(1), // offset
12749 Src.getOperand(2) // cachePolicy
12750 };
12751 auto *M = cast<MemSDNode>(Src);
12752 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12753 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12754 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12755 return LoadVal;
12756 }
12757 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12758 VTSign->getVT() == MVT::i8) ||
12759 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12760 VTSign->getVT() == MVT::i16)) &&
12761 Src.hasOneUse()) {
12762 auto *M = cast<MemSDNode>(Src);
12763 SDValue Ops[] = {Src.getOperand(0), // Chain
12764 Src.getOperand(1), // rsrc
12765 Src.getOperand(2), // vindex
12766 Src.getOperand(3), // voffset
12767 Src.getOperand(4), // soffset
12768 Src.getOperand(5), // offset
12769 Src.getOperand(6), Src.getOperand(7)};
12770 // replace with BUFFER_LOAD_BYTE/SHORT
12771 SDVTList ResList =
12772 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12773 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12774 ? AMDGPUISD::BUFFER_LOAD_BYTE
12775 : AMDGPUISD::BUFFER_LOAD_SHORT;
12776 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12777 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12778 return DCI.DAG.getMergeValues(
12779 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12780 }
12781 return SDValue();
12782}
12783
12784SDValue SITargetLowering::performClassCombine(SDNode *N,
12785 DAGCombinerInfo &DCI) const {
12786 SelectionDAG &DAG = DCI.DAG;
12787 SDValue Mask = N->getOperand(1);
12788
12789 // fp_class x, 0 -> false
12790 if (isNullConstant(Mask))
12791 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12792
12793 if (N->getOperand(0).isUndef())
12794 return DAG.getUNDEF(MVT::i1);
12795
12796 return SDValue();
12797}
12798
12799SDValue SITargetLowering::performRcpCombine(SDNode *N,
12800 DAGCombinerInfo &DCI) const {
12801 EVT VT = N->getValueType(0);
12802 SDValue N0 = N->getOperand(0);
12803
12804 if (N0.isUndef()) {
12805 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12806 SDLoc(N), VT);
12807 }
12808
12809 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12810 N0.getOpcode() == ISD::SINT_TO_FP)) {
12811 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12812 N->getFlags());
12813 }
12814
12815 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12816 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12817 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12818 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12819 N->getFlags());
12820 }
12821
12822 return SDValue();
12823}
12824
12825 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12826 unsigned MaxDepth) const {
12827 unsigned Opcode = Op.getOpcode();
12828 if (Opcode == ISD::FCANONICALIZE)
12829 return true;
12830
12831 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12832 const auto &F = CFP->getValueAPF();
12833 if (F.isNaN() && F.isSignaling())
12834 return false;
12835 if (!F.isDenormal())
12836 return true;
12837
12838 DenormalMode Mode =
12839 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12840 return Mode == DenormalMode::getIEEE();
12841 }
12842
12843 // If source is a result of another standard FP operation it is already in
12844 // canonical form.
12845 if (MaxDepth == 0)
12846 return false;
12847
12848 switch (Opcode) {
12849 // These will flush denorms if required.
12850 case ISD::FADD:
12851 case ISD::FSUB:
12852 case ISD::FMUL:
12853 case ISD::FCEIL:
12854 case ISD::FFLOOR:
12855 case ISD::FMA:
12856 case ISD::FMAD:
12857 case ISD::FSQRT:
12858 case ISD::FDIV:
12859 case ISD::FREM:
12860 case ISD::FP_ROUND:
12861 case ISD::FP_EXTEND:
12862 case ISD::FP16_TO_FP:
12863 case ISD::FP_TO_FP16:
12864 case ISD::BF16_TO_FP:
12865 case ISD::FP_TO_BF16:
12866 case ISD::FLDEXP:
12869 case AMDGPUISD::RCP:
12870 case AMDGPUISD::RSQ:
12874 case AMDGPUISD::LOG:
12875 case AMDGPUISD::EXP:
12879 case AMDGPUISD::FRACT:
12886 case AMDGPUISD::SIN_HW:
12887 case AMDGPUISD::COS_HW:
12888 return true;
12889
12890 // It can/will be lowered or combined as a bit operation.
12891 // Need to check their input recursively to handle.
12892 case ISD::FNEG:
12893 case ISD::FABS:
12894 case ISD::FCOPYSIGN:
12895 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12896
12897 case ISD::AND:
12898 if (Op.getValueType() == MVT::i32) {
12899 // Be careful as we only know it is a bitcast floating point type. It
12900 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12901 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12902 // is valid to optimize for all types.
12903 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12904 if (RHS->getZExtValue() == 0xffff0000) {
12905 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12906 }
12907 }
12908 }
12909 break;
12910
12911 case ISD::FSIN:
12912 case ISD::FCOS:
12913 case ISD::FSINCOS:
12914 return Op.getValueType().getScalarType() != MVT::f16;
12915
12916 case ISD::FMINNUM:
12917 case ISD::FMAXNUM:
12918 case ISD::FMINNUM_IEEE:
12919 case ISD::FMAXNUM_IEEE:
12920 case ISD::FMINIMUM:
12921 case ISD::FMAXIMUM:
12922 case AMDGPUISD::CLAMP:
12923 case AMDGPUISD::FMED3:
12924 case AMDGPUISD::FMAX3:
12925 case AMDGPUISD::FMIN3:
12926 case AMDGPUISD::FMAXIMUM3:
12927 case AMDGPUISD::FMINIMUM3: {
12928 // FIXME: Shouldn't treat the generic operations differently based on these.
12929 // However, we aren't really required to flush the result from
12930 // minnum/maxnum.
12931
12932 // snans will be quieted, so we only need to worry about denormals.
12933 if (Subtarget->supportsMinMaxDenormModes() ||
12934 // FIXME: denormalsEnabledForType is broken for dynamic
12935 denormalsEnabledForType(DAG, Op.getValueType()))
12936 return true;
12937
12938 // Flushing may be required.
12939 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for such
12940 // targets we need to check their inputs recursively.
12941
12942 // FIXME: Does this apply with clamp? It's implemented with max.
12943 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12944 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12945 return false;
12946 }
12947
12948 return true;
12949 }
12950 case ISD::SELECT: {
12951 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12952 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12953 }
12954 case ISD::BUILD_VECTOR: {
12955 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12956 SDValue SrcOp = Op.getOperand(i);
12957 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12958 return false;
12959 }
12960
12961 return true;
12962 }
12965 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12966 }
12968 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12969 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12970 }
12971 case ISD::UNDEF:
12972 // Could be anything.
12973 return false;
12974
12975 case ISD::BITCAST:
12976 // TODO: This is incorrect as it loses track of the operand's type. We may
12977 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12978 // same bits that are canonicalized in one type need not be in the other.
12979 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12980 case ISD::TRUNCATE: {
12981 // Hack around the mess we make when legalizing extract_vector_elt
12982 if (Op.getValueType() == MVT::i16) {
12983 SDValue TruncSrc = Op.getOperand(0);
12984 if (TruncSrc.getValueType() == MVT::i32 &&
12985 TruncSrc.getOpcode() == ISD::BITCAST &&
12986 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12987 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12988 }
12989 }
12990 return false;
12991 }
12992 case ISD::INTRINSIC_WO_CHAIN: {
12993 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12994 // TODO: Handle more intrinsics
12995 switch (IntrinsicID) {
12996 case Intrinsic::amdgcn_cvt_pkrtz:
12997 case Intrinsic::amdgcn_cubeid:
12998 case Intrinsic::amdgcn_frexp_mant:
12999 case Intrinsic::amdgcn_fdot2:
13000 case Intrinsic::amdgcn_rcp:
13001 case Intrinsic::amdgcn_rsq:
13002 case Intrinsic::amdgcn_rsq_clamp:
13003 case Intrinsic::amdgcn_rcp_legacy:
13004 case Intrinsic::amdgcn_rsq_legacy:
13005 case Intrinsic::amdgcn_trig_preop:
13006 case Intrinsic::amdgcn_log:
13007 case Intrinsic::amdgcn_exp2:
13008 case Intrinsic::amdgcn_sqrt:
13009 return true;
13010 default:
13011 break;
13012 }
13013
13014 break;
13015 }
13016 default:
13017 break;
13018 }
13019
13020 // FIXME: denormalsEnabledForType is broken for dynamic
13021 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13022 DAG.isKnownNeverSNaN(Op);
13023}
13024
13025 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13026 unsigned MaxDepth) const {
13027 const MachineRegisterInfo &MRI = MF.getRegInfo();
13028 MachineInstr *MI = MRI.getVRegDef(Reg);
13029 unsigned Opcode = MI->getOpcode();
13030
13031 if (Opcode == AMDGPU::G_FCANONICALIZE)
13032 return true;
13033
13034 std::optional<FPValueAndVReg> FCR;
13035 // Constant splat (can be padded with undef) or scalar constant.
13036 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13037 if (FCR->Value.isSignaling())
13038 return false;
13039 if (!FCR->Value.isDenormal())
13040 return true;
13041
13042 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13043 return Mode == DenormalMode::getIEEE();
13044 }
13045
13046 if (MaxDepth == 0)
13047 return false;
13048
13049 switch (Opcode) {
13050 case AMDGPU::G_FADD:
13051 case AMDGPU::G_FSUB:
13052 case AMDGPU::G_FMUL:
13053 case AMDGPU::G_FCEIL:
13054 case AMDGPU::G_FFLOOR:
13055 case AMDGPU::G_FRINT:
13056 case AMDGPU::G_FNEARBYINT:
13057 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13058 case AMDGPU::G_INTRINSIC_TRUNC:
13059 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13060 case AMDGPU::G_FMA:
13061 case AMDGPU::G_FMAD:
13062 case AMDGPU::G_FSQRT:
13063 case AMDGPU::G_FDIV:
13064 case AMDGPU::G_FREM:
13065 case AMDGPU::G_FPOW:
13066 case AMDGPU::G_FPEXT:
13067 case AMDGPU::G_FLOG:
13068 case AMDGPU::G_FLOG2:
13069 case AMDGPU::G_FLOG10:
13070 case AMDGPU::G_FPTRUNC:
13071 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13072 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13073 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13074 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13075 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13076 return true;
13077 case AMDGPU::G_FNEG:
13078 case AMDGPU::G_FABS:
13079 case AMDGPU::G_FCOPYSIGN:
13080 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13081 case AMDGPU::G_FMINNUM:
13082 case AMDGPU::G_FMAXNUM:
13083 case AMDGPU::G_FMINNUM_IEEE:
13084 case AMDGPU::G_FMAXNUM_IEEE:
13085 case AMDGPU::G_FMINIMUM:
13086 case AMDGPU::G_FMAXIMUM: {
13087 if (Subtarget->supportsMinMaxDenormModes() ||
13088 // FIXME: denormalsEnabledForType is broken for dynamic
13089 denormalsEnabledForType(MRI.getType(Reg), MF))
13090 return true;
13091
13092 [[fallthrough]];
13093 }
13094 case AMDGPU::G_BUILD_VECTOR:
13095 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13096 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13097 return false;
13098 return true;
13099 case AMDGPU::G_INTRINSIC:
13100 case AMDGPU::G_INTRINSIC_CONVERGENT:
13101 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13102 case Intrinsic::amdgcn_fmul_legacy:
13103 case Intrinsic::amdgcn_fmad_ftz:
13104 case Intrinsic::amdgcn_sqrt:
13105 case Intrinsic::amdgcn_fmed3:
13106 case Intrinsic::amdgcn_sin:
13107 case Intrinsic::amdgcn_cos:
13108 case Intrinsic::amdgcn_log:
13109 case Intrinsic::amdgcn_exp2:
13110 case Intrinsic::amdgcn_log_clamp:
13111 case Intrinsic::amdgcn_rcp:
13112 case Intrinsic::amdgcn_rcp_legacy:
13113 case Intrinsic::amdgcn_rsq:
13114 case Intrinsic::amdgcn_rsq_clamp:
13115 case Intrinsic::amdgcn_rsq_legacy:
13116 case Intrinsic::amdgcn_div_scale:
13117 case Intrinsic::amdgcn_div_fmas:
13118 case Intrinsic::amdgcn_div_fixup:
13119 case Intrinsic::amdgcn_fract:
13120 case Intrinsic::amdgcn_cvt_pkrtz:
13121 case Intrinsic::amdgcn_cubeid:
13122 case Intrinsic::amdgcn_cubema:
13123 case Intrinsic::amdgcn_cubesc:
13124 case Intrinsic::amdgcn_cubetc:
13125 case Intrinsic::amdgcn_frexp_mant:
13126 case Intrinsic::amdgcn_fdot2:
13127 case Intrinsic::amdgcn_trig_preop:
13128 return true;
13129 default:
13130 break;
13131 }
13132
13133 [[fallthrough]];
13134 default:
13135 return false;
13136 }
13137
13138 llvm_unreachable("invalid operation");
13139}
13140
13141// Constant fold canonicalize.
13142SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13143 const SDLoc &SL, EVT VT,
13144 const APFloat &C) const {
13145 // Flush denormals to 0 if not enabled.
13146 if (C.isDenormal()) {
13147 DenormalMode Mode =
13148 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13149 if (Mode == DenormalMode::getPreserveSign()) {
13150 return DAG.getConstantFP(
13151 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13152 }
13153
13154 if (Mode != DenormalMode::getIEEE())
13155 return SDValue();
13156 }
13157
13158 if (C.isNaN()) {
13159 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13160 if (C.isSignaling()) {
13161 // Quiet a signaling NaN.
13162 // FIXME: Is this supposed to preserve payload bits?
13163 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13164 }
13165
13166 // Make sure it is the canonical NaN bitpattern.
13167 //
13168 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13169 // immediate?
13170 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13171 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13172 }
13173
13174 // Already canonical.
13175 return DAG.getConstantFP(C, SL, VT);
13176}
13177
13179 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13180}
13181
13182SDValue
13183SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13184 DAGCombinerInfo &DCI) const {
13185 SelectionDAG &DAG = DCI.DAG;
13186 SDValue N0 = N->getOperand(0);
13187 EVT VT = N->getValueType(0);
13188
13189 // fcanonicalize undef -> qnan
13190 if (N0.isUndef()) {
13191 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13192 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13193 }
13194
13195 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13196 EVT VT = N->getValueType(0);
13197 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13198 }
13199
13200 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13201 // (fcanonicalize k)
13202 //
13203 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13204
13205 // TODO: This could be better with wider vectors that will be split to v2f16,
13206 // and to consider uses since there aren't that many packed operations.
13207 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13208 isTypeLegal(MVT::v2f16)) {
13209 SDLoc SL(N);
13210 SDValue NewElts[2];
13211 SDValue Lo = N0.getOperand(0);
13212 SDValue Hi = N0.getOperand(1);
13213 EVT EltVT = Lo.getValueType();
13214
13216 for (unsigned I = 0; I != 2; ++I) {
13217 SDValue Op = N0.getOperand(I);
13218 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13219 NewElts[I] =
13220 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13221 } else if (Op.isUndef()) {
13222 // Handled below based on what the other operand is.
13223 NewElts[I] = Op;
13224 } else {
13225 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13226 }
13227 }
13228
13229 // If one half is undef, and one is constant, prefer a splat vector rather
13230 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13231 // cheaper to use and may be free with a packed operation.
13232 if (NewElts[0].isUndef()) {
13233 if (isa<ConstantFPSDNode>(NewElts[1]))
13234 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13235 ? NewElts[1]
13236 : DAG.getConstantFP(0.0f, SL, EltVT);
13237 }
13238
13239 if (NewElts[1].isUndef()) {
13240 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13241 ? NewElts[0]
13242 : DAG.getConstantFP(0.0f, SL, EltVT);
13243 }
13244
13245 return DAG.getBuildVector(VT, SL, NewElts);
13246 }
13247 }
13248
13249 return SDValue();
13250}
13251
13252static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13253 switch (Opc) {
13254 case ISD::FMAXNUM:
13255 case ISD::FMAXNUM_IEEE:
13256 return AMDGPUISD::FMAX3;
13257 case ISD::FMAXIMUM:
13258 return AMDGPUISD::FMAXIMUM3;
13259 case ISD::SMAX:
13260 return AMDGPUISD::SMAX3;
13261 case ISD::UMAX:
13262 return AMDGPUISD::UMAX3;
13263 case ISD::FMINNUM:
13264 case ISD::FMINNUM_IEEE:
13265 return AMDGPUISD::FMIN3;
13266 case ISD::FMINIMUM:
13267 return AMDGPUISD::FMINIMUM3;
13268 case ISD::SMIN:
13269 return AMDGPUISD::SMIN3;
13270 case ISD::UMIN:
13271 return AMDGPUISD::UMIN3;
13272 default:
13273 llvm_unreachable("Not a min/max opcode");
13274 }
13275}
13276
13277SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13278 const SDLoc &SL, SDValue Src,
13279 SDValue MinVal,
13280 SDValue MaxVal,
13281 bool Signed) const {
13282
13283 // med3 comes from
13284 // min(max(x, K0), K1), K0 < K1
13285 // max(min(x, K0), K1), K1 < K0
13286 //
13287 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13288 // min/max op.
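// e.g. min(max(x, 0), 255) becomes med3(x, 0, 255), clamping x to [0, 255].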
13289 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13290 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13291
13292 if (!MinK || !MaxK)
13293 return SDValue();
13294
13295 if (Signed) {
13296 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13297 return SDValue();
13298 } else {
13299 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13300 return SDValue();
13301 }
13302
13303 EVT VT = MinK->getValueType(0);
13304 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13305 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13306 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13307
13308 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13309 // not available, but this is unlikely to be profitable as constants
13310 // will often need to be materialized & extended, especially on
13311 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13312 return SDValue();
13313}
13314
13315 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13316 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13317 return C;
13318
13319 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13320 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13321 return C;
13322 }
13323
13324 return nullptr;
13325}
13326
13327SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13328 const SDLoc &SL, SDValue Op0,
13329 SDValue Op1) const {
13330 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13331 if (!K1)
13332 return SDValue();
13333
13334 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13335 if (!K0)
13336 return SDValue();
13337
13338 // Ordered >= (although NaN inputs should have folded away by now).
13339 if (K0->getValueAPF() > K1->getValueAPF())
13340 return SDValue();
13341
13342 const MachineFunction &MF = DAG.getMachineFunction();
13343 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13344
13345 // TODO: Check IEEE bit enabled?
13346 EVT VT = Op0.getValueType();
13347 if (Info->getMode().DX10Clamp) {
13348 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13349 // hardware fmed3 behavior converting to a min.
13350 // FIXME: Should this be allowing -0.0?
13351 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13352 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13353 }
13354
13355 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13356 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13357 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13358 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13359 // then give the other result, which is different from med3 with a NaN
13360 // input.
13361 SDValue Var = Op0.getOperand(0);
13362 if (!DAG.isKnownNeverSNaN(Var))
13363 return SDValue();
13364
13365 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13366
13367 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13368 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13369 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13370 SDValue(K0, 0), SDValue(K1, 0));
13371 }
13372 }
13373
13374 return SDValue();
13375}
13376
13377/// \return true if the subtarget supports minimum3 and maximum3 with the given
13378/// base min/max opcode \p Opc for type \p VT.
13379static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13380 EVT VT) {
13381 switch (Opc) {
13382 case ISD::FMINNUM:
13383 case ISD::FMAXNUM:
13384 case ISD::FMINNUM_IEEE:
13385 case ISD::FMAXNUM_IEEE:
13386 case AMDGPUISD::FMIN_LEGACY:
13387 case AMDGPUISD::FMAX_LEGACY:
13388 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13389 case ISD::FMINIMUM:
13390 case ISD::FMAXIMUM:
13391 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13392 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13393 case ISD::SMAX:
13394 case ISD::SMIN:
13395 case ISD::UMAX:
13396 case ISD::UMIN:
13397 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13398 default:
13399 return false;
13400 }
13401
13402 llvm_unreachable("not a min/max opcode");
13403}
13404
13405SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13406 DAGCombinerInfo &DCI) const {
13407 SelectionDAG &DAG = DCI.DAG;
13408
13409 EVT VT = N->getValueType(0);
13410 unsigned Opc = N->getOpcode();
13411 SDValue Op0 = N->getOperand(0);
13412 SDValue Op1 = N->getOperand(1);
13413
13414 // Only do this if the inner op has one use since this will just increase
13415 // register pressure for no benefit.
13416
13417 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13418 // max(max(a, b), c) -> max3(a, b, c)
13419 // min(min(a, b), c) -> min3(a, b, c)
13420 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13421 SDLoc DL(N);
13422 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13423 Op0.getOperand(0), Op0.getOperand(1), Op1);
13424 }
13425
13426 // Try commuted.
13427 // max(a, max(b, c)) -> max3(a, b, c)
13428 // min(a, min(b, c)) -> min3(a, b, c)
13429 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13430 SDLoc DL(N);
13431 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13432 Op0, Op1.getOperand(0), Op1.getOperand(1));
13433 }
13434 }
13435
13436 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13437 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13438 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13439 if (SDValue Med3 = performIntMed3ImmCombine(
13440 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13441 return Med3;
13442 }
13443 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13444 if (SDValue Med3 = performIntMed3ImmCombine(
13445 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13446 return Med3;
13447 }
13448
13449 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13450 if (SDValue Med3 = performIntMed3ImmCombine(
13451 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13452 return Med3;
13453 }
13454 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13455 if (SDValue Med3 = performIntMed3ImmCombine(
13456 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13457 return Med3;
13458 }
13459
13460 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13461 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13462 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13463 (Opc == AMDGPUISD::FMIN_LEGACY &&
13464 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13465 (VT == MVT::f32 || VT == MVT::f64 ||
13466 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13467 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13468 Op0.hasOneUse()) {
13469 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13470 return Res;
13471 }
13472
13473 return SDValue();
13474}
13475
13476 static bool isClampZeroToOne(SDValue A, SDValue B) {
13477 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13478 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13479 // FIXME: Should this be allowing -0.0?
13480 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13481 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13482 }
13483 }
13484
13485 return false;
13486}
13487
13488// FIXME: Should only worry about snans for version with chain.
13489SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13490 DAGCombinerInfo &DCI) const {
13491 EVT VT = N->getValueType(0);
13492 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13493 // NaNs. With a NaN input, the order of the operands may change the result.
13494
13495 SelectionDAG &DAG = DCI.DAG;
13496 SDLoc SL(N);
13497
13498 SDValue Src0 = N->getOperand(0);
13499 SDValue Src1 = N->getOperand(1);
13500 SDValue Src2 = N->getOperand(2);
13501
13502 if (isClampZeroToOne(Src0, Src1)) {
13503 // const_a, const_b, x -> clamp is safe in all cases including signaling
13504 // nans.
13505 // FIXME: Should this be allowing -0.0?
13506 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13507 }
13508
13509 const MachineFunction &MF = DAG.getMachineFunction();
13510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13511
13512 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13513 // handling no dx10-clamp?
13514 if (Info->getMode().DX10Clamp) {
13515 // If NaNs are clamped to 0, we are free to reorder the inputs.
13516
13517 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13518 std::swap(Src0, Src1);
13519
13520 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13521 std::swap(Src1, Src2);
13522
13523 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13524 std::swap(Src0, Src1);
13525
13526 if (isClampZeroToOne(Src1, Src2))
13527 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13528 }
13529
13530 return SDValue();
13531}
13532
13533SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13534 DAGCombinerInfo &DCI) const {
13535 SDValue Src0 = N->getOperand(0);
13536 SDValue Src1 = N->getOperand(1);
13537 if (Src0.isUndef() && Src1.isUndef())
13538 return DCI.DAG.getUNDEF(N->getValueType(0));
13539 return SDValue();
13540}
13541
13542// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13543// expanded into a set of cmp/select instructions.
13544 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13545 unsigned NumElem,
13546 bool IsDivergentIdx,
13547 const GCNSubtarget *Subtarget) {
13548 if (UseDivergentRegisterIndexing)
13549 return false;
13550
13551 unsigned VecSize = EltSize * NumElem;
13552
13553 // Sub-dword vectors that fit in two dwords or less have a better implementation.
13554 if (VecSize <= 64 && EltSize < 32)
13555 return false;
13556
13557 // Always expand the rest of sub-dword instructions, otherwise it will be
13558 // lowered via memory.
13559 if (EltSize < 32)
13560 return true;
13561
13562 // Always do this if var-idx is divergent, otherwise it will become a loop.
13563 if (IsDivergentIdx)
13564 return true;
13565
13566 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13567 unsigned NumInsts = NumElem /* Number of compares */ +
13568 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
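 // For example, a v8i32 extract needs 8 compares + 8 v_cndmask_b32, i.e.
 // NumInsts == 16, which is still accepted in VGPR-index mode below.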
13569
13570 // On some architectures (GFX9) movrel is not available and it's better
13571 // to expand.
13572 if (Subtarget->useVGPRIndexMode())
13573 return NumInsts <= 16;
13574
13575 // If movrel is available, use it instead of expanding for vector of 8
13576 // elements.
13577 if (Subtarget->hasMovrel())
13578 return NumInsts <= 15;
13579
13580 return true;
13581}
13582
13583 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13584 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13585 if (isa<ConstantSDNode>(Idx))
13586 return false;
13587
13588 SDValue Vec = N->getOperand(0);
13589 EVT VecVT = Vec.getValueType();
13590 EVT EltVT = VecVT.getVectorElementType();
13591 unsigned EltSize = EltVT.getSizeInBits();
13592 unsigned NumElem = VecVT.getVectorNumElements();
13593
13594 return SITargetLowering::shouldExpandVectorDynExt(
13595 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13596}
13597
13598SDValue
13599SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13600 DAGCombinerInfo &DCI) const {
13601 SDValue Vec = N->getOperand(0);
13602 SelectionDAG &DAG = DCI.DAG;
13603
13604 EVT VecVT = Vec.getValueType();
13605 EVT VecEltVT = VecVT.getVectorElementType();
13606 EVT ResVT = N->getValueType(0);
13607
13608 unsigned VecSize = VecVT.getSizeInBits();
13609 unsigned VecEltSize = VecEltVT.getSizeInBits();
13610
13611 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13612 allUsesHaveSourceMods(N)) {
13613 SDLoc SL(N);
13614 SDValue Idx = N->getOperand(1);
13615 SDValue Elt =
13616 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13617 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13618 }
13619
13620 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13621 // =>
13622 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13623 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13624 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13625 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13626 SDLoc SL(N);
13627 SDValue Idx = N->getOperand(1);
13628 unsigned Opc = Vec.getOpcode();
13629
13630 switch (Opc) {
13631 default:
13632 break;
13633 // TODO: Support other binary operations.
13634 case ISD::FADD:
13635 case ISD::FSUB:
13636 case ISD::FMUL:
13637 case ISD::ADD:
13638 case ISD::UMIN:
13639 case ISD::UMAX:
13640 case ISD::SMIN:
13641 case ISD::SMAX:
13642 case ISD::FMAXNUM:
13643 case ISD::FMINNUM:
13644 case ISD::FMAXNUM_IEEE:
13645 case ISD::FMINNUM_IEEE:
13646 case ISD::FMAXIMUM:
13647 case ISD::FMINIMUM: {
13648 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13649 Vec.getOperand(0), Idx);
13650 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13651 Vec.getOperand(1), Idx);
13652
13653 DCI.AddToWorklist(Elt0.getNode());
13654 DCI.AddToWorklist(Elt1.getNode());
13655 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13656 }
13657 }
13658 }
13659
13660 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13661 if (shouldExpandVectorDynExt(N)) {
13662 SDLoc SL(N);
13663 SDValue Idx = N->getOperand(1);
13664 SDValue V;
13665 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13666 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13667 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13668 if (I == 0)
13669 V = Elt;
13670 else
13671 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13672 }
13673 return V;
13674 }
13675
13676 if (!DCI.isBeforeLegalize())
13677 return SDValue();
13678
13679 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13680 // elements. This exposes more load reduction opportunities by replacing
13681 // multiple small extract_vector_elements with a single 32-bit extract.
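 // For example, extracting element 5 of a loaded v8i8 becomes an extract of
 // dword 1 of the bitcast v2i32, followed by a right shift by 8 and a
 // truncate back to the 8-bit element type.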
13682 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13683 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13684 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13685 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13686
13687 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13688 unsigned EltIdx = BitIndex / 32;
13689 unsigned LeftoverBitIdx = BitIndex % 32;
13690 SDLoc SL(N);
13691
13692 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13693 DCI.AddToWorklist(Cast.getNode());
13694
13695 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13696 DAG.getConstant(EltIdx, SL, MVT::i32));
13697 DCI.AddToWorklist(Elt.getNode());
13698 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13699 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13700 DCI.AddToWorklist(Srl.getNode());
13701
13702 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13703 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13704 DCI.AddToWorklist(Trunc.getNode());
13705
13706 if (VecEltVT == ResVT) {
13707 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13708 }
13709
13710 assert(ResVT.isScalarInteger());
13711 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13712 }
13713
13714 return SDValue();
13715}
13716
13717SDValue
13718SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13719 DAGCombinerInfo &DCI) const {
13720 SDValue Vec = N->getOperand(0);
13721 SDValue Idx = N->getOperand(2);
13722 EVT VecVT = Vec.getValueType();
13723 EVT EltVT = VecVT.getVectorElementType();
13724
13725 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13726 // => BUILD_VECTOR n x select (e, const-idx)
13727 if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
13728 return SDValue();
13729
13730 SelectionDAG &DAG = DCI.DAG;
13731 SDLoc SL(N);
13732 SDValue Ins = N->getOperand(1);
13733 EVT IdxVT = Idx.getValueType();
13734
13735 SmallVector<SDValue, 16> Ops;
13736 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13737 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13738 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13739 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13740 Ops.push_back(V);
13741 }
13742
13743 return DAG.getBuildVector(VecVT, SL, Ops);
13744}
13745
13746/// Return the source of an fp_extend from f16 to f32, or a converted FP
13747/// constant.
13748 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13749 if (Src.getOpcode() == ISD::FP_EXTEND &&
13750 Src.getOperand(0).getValueType() == MVT::f16) {
13751 return Src.getOperand(0);
13752 }
13753
13754 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13755 APFloat Val = CFP->getValueAPF();
13756 bool LosesInfo = true;
13757 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13758 if (!LosesInfo)
13759 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13760 }
13761
13762 return SDValue();
13763}
13764
13765SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13766 DAGCombinerInfo &DCI) const {
13767 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13768 "combine only useful on gfx8");
13769
13770 SDValue TruncSrc = N->getOperand(0);
13771 EVT VT = N->getValueType(0);
13772 if (VT != MVT::f16)
13773 return SDValue();
13774
13775 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13776 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13777 return SDValue();
13778
13779 SelectionDAG &DAG = DCI.DAG;
13780 SDLoc SL(N);
13781
13782 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13783 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13784 // casting back.
13785
13786 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13787 // fmin(fmax(a, b), fmax(fmin(a, b), c))
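 // The min/max form produces the same median value; e.g. for a=1.0, b=3.0,
 // c=2.0: fmin(fmax(1,3), fmax(fmin(1,3), 2)) = fmin(3.0, 2.0) = 2.0.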
13788 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13789 if (!A)
13790 return SDValue();
13791
13792 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13793 if (!B)
13794 return SDValue();
13795
13796 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13797 if (!C)
13798 return SDValue();
13799
13800 // This changes signaling nan behavior. If an input is a signaling nan, it
13801 // would have been quieted by the fpext originally. We don't care because
13802 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13803 // we would be worse off than just doing the promotion.
13804 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13805 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13806 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13807 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13808}
13809
13810unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13811 const SDNode *N0,
13812 const SDNode *N1) const {
13813 EVT VT = N0->getValueType(0);
13814
13815 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13816 // support denormals ever.
13817 if (((VT == MVT::f32 &&
13818 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13819 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13820 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13821 isOperationLegal(ISD::FMAD, VT))
13822 return ISD::FMAD;
13823
13824 const TargetOptions &Options = DAG.getTarget().Options;
13825 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13826 (N0->getFlags().hasAllowContract() &&
13827 N1->getFlags().hasAllowContract())) &&
13828 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13829 return ISD::FMA;
13830 }
13831
13832 return 0;
13833}
13834
13835// For a reassociatable opcode perform:
13836// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
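// For example, with x and z uniform and y divergent:
//   add x, (add y, z) -> add (add x, z), y
// keeps the inner add on uniform operands so it can be selected to a scalar
// instruction, leaving only the outer add divergent.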
13837SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13838 SelectionDAG &DAG) const {
13839 EVT VT = N->getValueType(0);
13840 if (VT != MVT::i32 && VT != MVT::i64)
13841 return SDValue();
13842
13843 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13844 return SDValue();
13845
13846 unsigned Opc = N->getOpcode();
13847 SDValue Op0 = N->getOperand(0);
13848 SDValue Op1 = N->getOperand(1);
13849
13850 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13851 return SDValue();
13852
13853 if (Op0->isDivergent())
13854 std::swap(Op0, Op1);
13855
13856 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13857 return SDValue();
13858
13859 SDValue Op2 = Op1.getOperand(1);
13860 Op1 = Op1.getOperand(0);
13861 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13862 return SDValue();
13863
13864 if (Op1->isDivergent())
13865 std::swap(Op1, Op2);
13866
13867 SDLoc SL(N);
13868 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13869 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13870}
13871
13872static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13873 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13874 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13875 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13876 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13877 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13878}
13879
13880// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13881// multiplies, if any.
13882//
13883// Full 64-bit multiplies that feed into an addition are lowered here instead
13884// of using the generic expansion. The generic expansion ends up with
13885// a tree of ADD nodes that prevents us from using the "add" part of the
13886// MAD instruction. The expansion produced here results in a chain of ADDs
13887// instead of a tree.
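// For example, (add (mul i64:x, i64:y), i64:z) where x and y are known to fit
// in 32 unsigned bits becomes a single mad_u64_u32; wider factors only add the
// extra high-half multiplies shown in the pseudo code below.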
13888SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13889 DAGCombinerInfo &DCI) const {
13890 assert(N->getOpcode() == ISD::ADD);
13891
13892 SelectionDAG &DAG = DCI.DAG;
13893 EVT VT = N->getValueType(0);
13894 SDLoc SL(N);
13895 SDValue LHS = N->getOperand(0);
13896 SDValue RHS = N->getOperand(1);
13897
13898 if (VT.isVector())
13899 return SDValue();
13900
13901 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13902 // result in scalar registers for uniform values.
13903 if (!N->isDivergent() && Subtarget->hasSMulHi())
13904 return SDValue();
13905
13906 unsigned NumBits = VT.getScalarSizeInBits();
13907 if (NumBits <= 32 || NumBits > 64)
13908 return SDValue();
13909
13910 if (LHS.getOpcode() != ISD::MUL) {
13911 assert(RHS.getOpcode() == ISD::MUL);
13912 std::swap(LHS, RHS);
13913 }
13914
13915 // Avoid the fold if it would unduly increase the number of multiplies due to
13916 // multiple uses, except on hardware with full-rate multiply-add (which is
13917 // part of full-rate 64-bit ops).
13918 if (!Subtarget->hasFullRate64Ops()) {
13919 unsigned NumUsers = 0;
13920 for (SDNode *User : LHS->users()) {
13921 // There is a use that does not feed into addition, so the multiply can't
13922 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13923 if (User->getOpcode() != ISD::ADD)
13924 return SDValue();
13925
13926 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13927 // MUL + 3xADD + 3xADDC over 3xMAD.
13928 ++NumUsers;
13929 if (NumUsers >= 3)
13930 return SDValue();
13931 }
13932 }
13933
13934 SDValue MulLHS = LHS.getOperand(0);
13935 SDValue MulRHS = LHS.getOperand(1);
13936 SDValue AddRHS = RHS;
13937
13938 // Always check whether operands are small unsigned values, since that
13939 // knowledge is useful in more cases. Check for small signed values only if
13940 // doing so can unlock a shorter code sequence.
13941 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13942 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13943
13944 bool MulSignedLo = false;
13945 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13946 MulSignedLo =
13947 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13948 }
13949
13950 // The operands and final result all have the same number of bits. If
13951 // operands need to be extended, they can be extended with garbage. The
13952 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13953 // truncated away in the end.
13954 if (VT != MVT::i64) {
13955 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13956 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13957 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13958 }
13959
13960 // The basic code generated is conceptually straightforward. Pseudo code:
13961 //
13962 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13963 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13964 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13965 //
13966 // The second and third lines are optional, depending on whether the factors
13967 // are {sign,zero}-extended or not.
13968 //
13969 // The actual DAG is noisier than the pseudo code, but only due to
13970 // instructions that disassemble values into low and high parts, and
13971 // assemble the final result.
13972 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13973
13974 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13975 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13976 SDValue Accum =
13977 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13978
13979 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13980 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13981
13982 if (!MulLHSUnsigned32) {
13983 auto MulLHSHi =
13984 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13985 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13986 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13987 }
13988
13989 if (!MulRHSUnsigned32) {
13990 auto MulRHSHi =
13991 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13992 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13993 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13994 }
13995
13996 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13997 Accum = DAG.getBitcast(MVT::i64, Accum);
13998 }
13999
14000 if (VT != MVT::i64)
14001 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14002 return Accum;
14003}
14004
14005SDValue
14006SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14007 DAGCombinerInfo &DCI) const {
14008 SDValue RHS = N->getOperand(1);
14009 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14010 if (!CRHS)
14011 return SDValue();
14012
14013 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14014 // common.
14015 uint64_t Val = CRHS->getZExtValue();
14016 if (countr_zero(Val) >= 32) {
14017 SelectionDAG &DAG = DCI.DAG;
14018 SDLoc SL(N);
14019 SDValue LHS = N->getOperand(0);
14020
14021 // Avoid carry machinery if we know the low half of the add does not
14022 // contribute to the final result.
14023 //
14024 // add i64:x, K if computeTrailingZeros(K) >= 32
14025 // => build_pair (add x.hi, K.hi), x.lo
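 // e.g. add i64:x, 0x500000000 -> build_pair (add x.hi, 5), x.lo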
14026
14027 // Breaking the 64-bit add here with this strange constant is unlikely
14028 // to interfere with addressing mode patterns.
14029
14030 SDValue Hi = getHiHalf64(LHS, DAG);
14031 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14032 SDValue AddHi =
14033 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14034
14035 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14036 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14037 }
14038
14039 return SDValue();
14040}
14041
14042// Collect the ultimate src of each of the mul node's operands, and confirm
14043 // each operand is a single byte (8 bits).
14044static std::optional<ByteProvider<SDValue>>
14045handleMulOperand(const SDValue &MulOperand) {
14046 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14047 if (!Byte0 || Byte0->isConstantZero()) {
14048 return std::nullopt;
14049 }
14050 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14051 if (Byte1 && !Byte1->isConstantZero()) {
14052 return std::nullopt;
14053 }
14054 return Byte0;
14055}
14056
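// In a v_perm_b32 selector each result byte is picked by one selector byte:
// values 0-7 choose a byte of the two source dwords and 0x0c yields a
// constant zero byte. addPermMasks merges two selectors that define disjoint
// result bytes, e.g. combining 0x0c0c0c01 with 0x0c0c050c gives 0x0c0c0501.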
14057static unsigned addPermMasks(unsigned First, unsigned Second) {
14058 unsigned FirstCs = First & 0x0c0c0c0c;
14059 unsigned SecondCs = Second & 0x0c0c0c0c;
14060 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14061 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14062
14063 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14064 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14065 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14066 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14067
14068 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14069}
14070
14071struct DotSrc {
14072 SDValue SrcOp;
14073 int64_t PermMask;
14074 int64_t DWordOffset;
14075 };
14076
14077 static void placeSources(ByteProvider<SDValue> &Src0,
14078 ByteProvider<SDValue> &Src1,
14079 SmallVectorImpl<DotSrc> &Src0s,
14080 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14081
14082 assert(Src0.Src.has_value() && Src1.Src.has_value());
14083 // Src0s and Src1s are empty, just place arbitrarily.
14084 if (Step == 0) {
14085 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14086 Src0.SrcOffset / 4});
14087 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14088 Src1.SrcOffset / 4});
14089 return;
14090 }
14091
14092 for (int BPI = 0; BPI < 2; BPI++) {
14093 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14094 if (BPI == 1) {
14095 BPP = {Src1, Src0};
14096 }
14097 unsigned ZeroMask = 0x0c0c0c0c;
14098 unsigned FMask = 0xFF << (8 * (3 - Step));
14099
14100 unsigned FirstMask =
14101 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14102 unsigned SecondMask =
14103 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14104 // Attempt to find Src vector which contains our SDValue, if so, add our
14105 // perm mask to the existing one. If we are unable to find a match for the
14106 // first SDValue, attempt to find match for the second.
14107 int FirstGroup = -1;
14108 for (int I = 0; I < 2; I++) {
14109 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14110 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14111 return IterElt.SrcOp == *BPP.first.Src &&
14112 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14113 };
14114
14115 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14116 if (Match != Srcs.end()) {
14117 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14118 FirstGroup = I;
14119 break;
14120 }
14121 }
14122 if (FirstGroup != -1) {
14123 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14124 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14125 return IterElt.SrcOp == *BPP.second.Src &&
14126 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14127 };
14128 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14129 if (Match != Srcs.end()) {
14130 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14131 } else
14132 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14133 return;
14134 }
14135 }
14136
14137 // If we have made it here, then we could not find a match in Src0s or Src1s
14138 // for either Src0 or Src1, so just place them arbitrarily.
14139
14140 unsigned ZeroMask = 0x0c0c0c0c;
14141 unsigned FMask = 0xFF << (8 * (3 - Step));
14142
14143 Src0s.push_back(
14144 {*Src0.Src,
14145 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14146 Src0.SrcOffset / 4});
14147 Src1s.push_back(
14148 {*Src1.Src,
14149 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14150 Src1.SrcOffset / 4});
14151}
14152
14152
14153 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14154 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14155 bool IsAny) {
14156
14157 // If we have only one source, just permute it accordingly.
14158 if (Srcs.size() == 1) {
14159 auto *Elt = Srcs.begin();
14160 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14161
14162 // v_perm will produce the original value
14163 if (Elt->PermMask == 0x3020100)
14164 return EltOp;
14165
14166 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14167 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14168 }
14169
14170 auto *FirstElt = Srcs.begin();
14171 auto *SecondElt = std::next(FirstElt);
14172
14173 SmallVector<SDValue, 3> Perms;
14174
14175 // If we have multiple sources in the chain, combine them via perms (using
14176 // calculated perm mask) and Ors.
14177 while (true) {
14178 auto FirstMask = FirstElt->PermMask;
14179 auto SecondMask = SecondElt->PermMask;
14180
14181 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14182 unsigned FirstPlusFour = FirstMask | 0x04040404;
14183 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14184 // original 0x0C.
14185 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14186
14187 auto PermMask = addPermMasks(FirstMask, SecondMask);
14188 auto FirstVal =
14189 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14190 auto SecondVal =
14191 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14192
14193 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14194 SecondVal,
14195 DAG.getConstant(PermMask, SL, MVT::i32)));
14196
14197 FirstElt = std::next(SecondElt);
14198 if (FirstElt == Srcs.end())
14199 break;
14200
14201 SecondElt = std::next(FirstElt);
14202 // If we only have a FirstElt, then just combine that into the cumulative
14203 // source node.
14204 if (SecondElt == Srcs.end()) {
14205 auto EltOp =
14206 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14207
14208 Perms.push_back(
14209 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14210 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14211 break;
14212 }
14213 }
14214
14215 assert(Perms.size() == 1 || Perms.size() == 2);
14216 return Perms.size() == 2
14217 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14218 : Perms[0];
14219}
14220
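// For a chain of length 2, e.g., a mask built as 0x01020c0c is shifted down
// to 0x00000102 and then becomes 0x0c0c0102, so the two unused upper result
// bytes read as constant zero.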
14221static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14222 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14223 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14224 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14225 EntryMask += ZeroMask;
14226 }
14227}
14228
14229static bool isMul(const SDValue Op) {
14230 auto Opcode = Op.getOpcode();
14231
14232 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14233 Opcode == AMDGPUISD::MUL_I24);
14234}
14235
14236static std::optional<bool>
14237 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14238 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14239 const SDValue &S1Op, const SelectionDAG &DAG) {
14240 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14241 // of the dot4 are irrelevant.
14242 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14243 return false;
14244
14245 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14246 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14247 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14248 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14249 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14250 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14251
14252 assert(!(S0IsUnsigned && S0IsSigned));
14253 assert(!(S1IsUnsigned && S1IsSigned));
14254
14255 // There are 9 possible permutations of
14256 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14257
14258 // In two permutations, the sign bits are known to be the same for both Ops,
14259 // so simply return Signed / Unsigned corresponding to the MSB
14260
14261 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14262 return S0IsSigned;
14263
14264 // In another two permutations, the sign bits are known to be opposite. In
14265 // this case return std::nullopt to indicate a bad match.
14266
14267 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14268 return std::nullopt;
14269
14270 // In the remaining five permutations, we don't know the value of the sign
14271 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14272 // the upper bits must be extension bits. Thus, the only ways for the sign
14273 // bit to be unknown are if it was sign extended from an unknown value, or
14274 // if it was any-extended. In either case, it is correct to use the signed
14275 // version of dot4.
14276
14277 // In two such permutations, we know the sign bit is set for
14278 // one op, and the other is unknown. It is okay to use the signed version of
14279 // dot4.
14280 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14281 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14282 return true;
14283
14284 // In one such permutation, we don't know either of the sign bits. It is okay
14285 // to use the signed version of dot4.
14286 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14287 return true;
14288
14289 // In two such permutations, we know the sign bit is unset for
14290 // one op, and the other is unknown. Return std::nullopt to indicate a
14291 // bad match.
14292 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14293 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14294 return std::nullopt;
14295
14296 llvm_unreachable("Fully covered condition");
14297}
14298
14299SDValue SITargetLowering::performAddCombine(SDNode *N,
14300 DAGCombinerInfo &DCI) const {
14301 SelectionDAG &DAG = DCI.DAG;
14302 EVT VT = N->getValueType(0);
14303 SDLoc SL(N);
14304 SDValue LHS = N->getOperand(0);
14305 SDValue RHS = N->getOperand(1);
14306
14307 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14308 if (Subtarget->hasMad64_32()) {
14309 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14310 return Folded;
14311 }
14312 }
14313
14314 if (SDValue V = reassociateScalarOps(N, DAG)) {
14315 return V;
14316 }
14317
14318 if (VT == MVT::i64) {
14319 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14320 return Folded;
14321 }
14322
14323 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14324 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14325 SDValue TempNode(N, 0);
14326 std::optional<bool> IsSigned;
14327 SmallVector<DotSrc, 4> Src0s;
14328 SmallVector<DotSrc, 4> Src1s;
14329 SmallVector<SDValue, 4> Src2s;
14330
14331 // Match the v_dot4 tree, while collecting src nodes.
14332 int ChainLength = 0;
14333 for (int I = 0; I < 4; I++) {
14334 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14335 if (MulIdx == -1)
14336 break;
14337 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14338 if (!Src0)
14339 break;
14340 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14341 if (!Src1)
14342 break;
14343
14344 auto IterIsSigned = checkDot4MulSignedness(
14345 TempNode->getOperand(MulIdx), *Src0, *Src1,
14346 TempNode->getOperand(MulIdx)->getOperand(0),
14347 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14348 if (!IterIsSigned)
14349 break;
14350 if (!IsSigned)
14351 IsSigned = *IterIsSigned;
14352 if (*IterIsSigned != *IsSigned)
14353 break;
14354 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14355 auto AddIdx = 1 - MulIdx;
14356 // Allow the special case where add (add (mul24, 0), mul24) has become
14357 // add (mul24, mul24).
14358 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14359 Src2s.push_back(TempNode->getOperand(AddIdx));
14360 auto Src0 =
14361 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14362 if (!Src0)
14363 break;
14364 auto Src1 =
14365 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14366 if (!Src1)
14367 break;
14368 auto IterIsSigned = checkDot4MulSignedness(
14369 TempNode->getOperand(AddIdx), *Src0, *Src1,
14370 TempNode->getOperand(AddIdx)->getOperand(0),
14371 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14372 if (!IterIsSigned)
14373 break;
14374 assert(IsSigned);
14375 if (*IterIsSigned != *IsSigned)
14376 break;
14377 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14378 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14379 ChainLength = I + 2;
14380 break;
14381 }
14382
14383 TempNode = TempNode->getOperand(AddIdx);
14384 Src2s.push_back(TempNode);
14385 ChainLength = I + 1;
14386 if (TempNode->getNumOperands() < 2)
14387 break;
14388 LHS = TempNode->getOperand(0);
14389 RHS = TempNode->getOperand(1);
14390 }
14391
14392 if (ChainLength < 2)
14393 return SDValue();
14394
14395 // Masks were constructed with assumption that we would find a chain of
14396 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14397 // 0x0c) so they do not affect dot calculation.
14398 if (ChainLength < 4) {
14399 fixMasks(Src0s, ChainLength);
14400 fixMasks(Src1s, ChainLength);
14401 }
14402
14403 SDValue Src0, Src1;
14404
14405 // If we are just using a single source for both, and have permuted the
14406 // bytes consistently, we can just use the sources without permuting
14407 // (commutation).
14408 bool UseOriginalSrc = false;
14409 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14410 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14411 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14412 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14413 SmallVector<unsigned, 4> SrcBytes;
14414 auto Src0Mask = Src0s.begin()->PermMask;
14415 SrcBytes.push_back(Src0Mask & 0xFF000000);
14416 bool UniqueEntries = true;
14417 for (auto I = 1; I < 4; I++) {
14418 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14419
14420 if (is_contained(SrcBytes, NextByte)) {
14421 UniqueEntries = false;
14422 break;
14423 }
14424 SrcBytes.push_back(NextByte);
14425 }
14426
14427 if (UniqueEntries) {
14428 UseOriginalSrc = true;
14429
14430 auto *FirstElt = Src0s.begin();
14431 auto FirstEltOp =
14432 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14433
14434 auto *SecondElt = Src1s.begin();
14435 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14436 SecondElt->DWordOffset);
14437
14438 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14439 MVT::getIntegerVT(32));
14440 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14441 MVT::getIntegerVT(32));
14442 }
14443 }
14444
14445 if (!UseOriginalSrc) {
14446 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14447 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14448 }
14449
14450 assert(IsSigned);
14451 SDValue Src2 =
14452 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14453
14454 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14455 : Intrinsic::amdgcn_udot4,
14456 SL, MVT::i64);
14457
14458 assert(!VT.isVector());
14459 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14460 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14461
14462 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14463 }
14464
14465 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14466 return SDValue();
14467
14468 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14469 // add x, sext (setcc) => usubo_carry x, 0, setcc
14470 unsigned Opc = LHS.getOpcode();
14471 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14472 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14473 std::swap(RHS, LHS);
14474
14475 Opc = RHS.getOpcode();
14476 switch (Opc) {
14477 default:
14478 break;
14479 case ISD::ZERO_EXTEND:
14480 case ISD::SIGN_EXTEND:
14481 case ISD::ANY_EXTEND: {
14482 auto Cond = RHS.getOperand(0);
14483 // If this won't be a real VOPC output, we would still need to insert an
14484 // extra instruction anyway.
14485 if (!isBoolSGPR(Cond))
14486 break;
14487 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14488 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14489 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14490 return DAG.getNode(Opc, SL, VTList, Args);
14491 }
14492 case ISD::UADDO_CARRY: {
14493 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14494 if (!isNullConstant(RHS.getOperand(1)))
14495 break;
14496 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14497 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14498 }
14499 }
14500 return SDValue();
14501}
14502
14503SDValue SITargetLowering::performSubCombine(SDNode *N,
14504 DAGCombinerInfo &DCI) const {
14505 SelectionDAG &DAG = DCI.DAG;
14506 EVT VT = N->getValueType(0);
14507
14508 if (VT == MVT::i64) {
14509 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14510 return Folded;
14511 }
14512
14513 if (VT != MVT::i32)
14514 return SDValue();
14515
14516 SDLoc SL(N);
14517 SDValue LHS = N->getOperand(0);
14518 SDValue RHS = N->getOperand(1);
14519
14520 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14521 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14522 unsigned Opc = RHS.getOpcode();
14523 switch (Opc) {
14524 default:
14525 break;
14526 case ISD::ZERO_EXTEND:
14527 case ISD::SIGN_EXTEND:
14528 case ISD::ANY_EXTEND: {
14529 auto Cond = RHS.getOperand(0);
14530 // If this won't be a real VOPC output, we would still need to insert an
14531 // extra instruction anyway.
14532 if (!isBoolSGPR(Cond))
14533 break;
14534 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14535 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14536 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14537 return DAG.getNode(Opc, SL, VTList, Args);
14538 }
14539 }
14540
14541 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14542 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14543 if (!isNullConstant(LHS.getOperand(1)))
14544 return SDValue();
14545 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14546 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14547 }
14548 return SDValue();
14549}
14550
14551SDValue
14552SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14553 DAGCombinerInfo &DCI) const {
14554
14555 if (N->getValueType(0) != MVT::i32)
14556 return SDValue();
14557
14558 if (!isNullConstant(N->getOperand(1)))
14559 return SDValue();
14560
14561 SelectionDAG &DAG = DCI.DAG;
14562 SDValue LHS = N->getOperand(0);
14563
14564 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14565 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14566 unsigned LHSOpc = LHS.getOpcode();
14567 unsigned Opc = N->getOpcode();
14568 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14569 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14570 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14571 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14572 }
14573 return SDValue();
14574}
14575
14576SDValue SITargetLowering::performFAddCombine(SDNode *N,
14577 DAGCombinerInfo &DCI) const {
14578 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14579 return SDValue();
14580
14581 SelectionDAG &DAG = DCI.DAG;
14582 EVT VT = N->getValueType(0);
14583
14584 SDLoc SL(N);
14585 SDValue LHS = N->getOperand(0);
14586 SDValue RHS = N->getOperand(1);
14587
14588 // These should really be instruction patterns, but writing patterns with
14589 // source modifiers is a pain.
14590
14591 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14592 if (LHS.getOpcode() == ISD::FADD) {
14593 SDValue A = LHS.getOperand(0);
14594 if (A == LHS.getOperand(1)) {
14595 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14596 if (FusedOp != 0) {
14597 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14598 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14599 }
14600 }
14601 }
14602
14603 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14604 if (RHS.getOpcode() == ISD::FADD) {
14605 SDValue A = RHS.getOperand(0);
14606 if (A == RHS.getOperand(1)) {
14607 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14608 if (FusedOp != 0) {
14609 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14610 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14611 }
14612 }
14613 }
14614
14615 return SDValue();
14616}
14617
14618SDValue SITargetLowering::performFSubCombine(SDNode *N,
14619 DAGCombinerInfo &DCI) const {
14620 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14621 return SDValue();
14622
14623 SelectionDAG &DAG = DCI.DAG;
14624 SDLoc SL(N);
14625 EVT VT = N->getValueType(0);
14626 assert(!VT.isVector());
14627
14628 // Try to get the fneg to fold into the source modifier. This undoes generic
14629 // DAG combines and folds them into the mad.
14630 //
14631 // Only do this if we are not trying to support denormals. v_mad_f32 does
14632 // not support denormals ever.
14633 SDValue LHS = N->getOperand(0);
14634 SDValue RHS = N->getOperand(1);
14635 if (LHS.getOpcode() == ISD::FADD) {
14636 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14637 SDValue A = LHS.getOperand(0);
14638 if (A == LHS.getOperand(1)) {
14639 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14640 if (FusedOp != 0) {
14641 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14642 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14643
14644 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14645 }
14646 }
14647 }
14648
14649 if (RHS.getOpcode() == ISD::FADD) {
14650 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14651
14652 SDValue A = RHS.getOperand(0);
14653 if (A == RHS.getOperand(1)) {
14654 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14655 if (FusedOp != 0) {
14656 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14657 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14658 }
14659 }
14660 }
14661
14662 return SDValue();
14663}
14664
14665SDValue SITargetLowering::performFDivCombine(SDNode *N,
14666 DAGCombinerInfo &DCI) const {
14667 SelectionDAG &DAG = DCI.DAG;
14668 SDLoc SL(N);
14669 EVT VT = N->getValueType(0);
14670 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14671 return SDValue();
14672
14673 SDValue LHS = N->getOperand(0);
14674 SDValue RHS = N->getOperand(1);
14675
14676 SDNodeFlags Flags = N->getFlags();
14677 SDNodeFlags RHSFlags = RHS->getFlags();
14678 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14679 !RHS->hasOneUse())
14680 return SDValue();
14681
14682 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14683 bool IsNegative = false;
14684 if (CLHS->isExactlyValue(1.0) ||
14685 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14686 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14687 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14688 if (RHS.getOpcode() == ISD::FSQRT) {
14689 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14690 SDValue Rsq =
14691 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14692 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14693 }
14694 }
14695 }
14696
14697 return SDValue();
14698}
14699
14700SDValue SITargetLowering::performFMulCombine(SDNode *N,
14701 DAGCombinerInfo &DCI) const {
14702 SelectionDAG &DAG = DCI.DAG;
14703 EVT VT = N->getValueType(0);
14704 EVT ScalarVT = VT.getScalarType();
14705 EVT IntVT = VT.changeElementType(MVT::i32);
14706
14707 SDValue LHS = N->getOperand(0);
14708 SDValue RHS = N->getOperand(1);
14709
14710 // It is cheaper to realize i32 inline constants as compared against
14711 // materializing f16 or f64 (or even non-inline f32) values,
14712 // possible via ldexp usage, as shown below :
14713 //
14714 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14715 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14716 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
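 // e.g. fmul x, (select y, 8.0, 0.5) -> ldexp( x, (select i32 y, 3, -1) )
 // fmul x, (select y, -8.0, -0.5) -> ldexp( (fneg x), (select i32 y, 3, -1) )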
14717 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14718 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14719 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14720 if (!TrueNode)
14721 return SDValue();
14722 const ConstantFPSDNode *FalseNode =
14723 isConstOrConstSplatFP(RHS.getOperand(2));
14724 if (!FalseNode)
14725 return SDValue();
14726
14727 if (TrueNode->isNegative() != FalseNode->isNegative())
14728 return SDValue();
14729
14730 // For f32, only non-inline constants should be transformed.
14731 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14732 if (ScalarVT == MVT::f32 &&
14733 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14734 TII->isInlineConstant(FalseNode->getValueAPF()))
14735 return SDValue();
14736
14737 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14738 if (TrueNodeExpVal == INT_MIN)
14739 return SDValue();
14740 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14741 if (FalseNodeExpVal == INT_MIN)
14742 return SDValue();
14743
14744 SDLoc SL(N);
14745 SDValue SelectNode =
14746 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14747 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14748 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14749
14750 LHS = TrueNode->isNegative()
14751 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14752 : LHS;
14753
14754 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14755 }
14756
14757 return SDValue();
14758}
14759
14760SDValue SITargetLowering::performFMACombine(SDNode *N,
14761 DAGCombinerInfo &DCI) const {
14762 SelectionDAG &DAG = DCI.DAG;
14763 EVT VT = N->getValueType(0);
14764 SDLoc SL(N);
14765
14766 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14767 return SDValue();
14768
14769 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14770 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14771 SDValue Op1 = N->getOperand(0);
14772 SDValue Op2 = N->getOperand(1);
14773 SDValue FMA = N->getOperand(2);
14774
14775 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14776 Op2.getOpcode() != ISD::FP_EXTEND)
14777 return SDValue();
14778
14779 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14780 // regardless of the denorm mode setting. Therefore,
14781 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14782 const TargetOptions &Options = DAG.getTarget().Options;
14783 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14784 (N->getFlags().hasAllowContract() &&
14785 FMA->getFlags().hasAllowContract())) {
14786 Op1 = Op1.getOperand(0);
14787 Op2 = Op2.getOperand(0);
14788 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14789 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14790 return SDValue();
14791
14792 SDValue Vec1 = Op1.getOperand(0);
14793 SDValue Idx1 = Op1.getOperand(1);
14794 SDValue Vec2 = Op2.getOperand(0);
14795
14796 SDValue FMAOp1 = FMA.getOperand(0);
14797 SDValue FMAOp2 = FMA.getOperand(1);
14798 SDValue FMAAcc = FMA.getOperand(2);
14799
14800 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14801 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14802 return SDValue();
14803
14804 FMAOp1 = FMAOp1.getOperand(0);
14805 FMAOp2 = FMAOp2.getOperand(0);
14806 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14807 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14808 return SDValue();
14809
14810 SDValue Vec3 = FMAOp1.getOperand(0);
14811 SDValue Vec4 = FMAOp2.getOperand(0);
14812 SDValue Idx2 = FMAOp1.getOperand(1);
14813
14814 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14815 // Idx1 and Idx2 cannot be the same.
14816 Idx1 == Idx2)
14817 return SDValue();
14818
14819 if (Vec1 == Vec2 || Vec3 == Vec4)
14820 return SDValue();
14821
14822 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14823 return SDValue();
14824
14825 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14826 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14827 DAG.getTargetConstant(0, SL, MVT::i1));
14828 }
14829 }
14830 return SDValue();
14831}
14832
14833SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14834 DAGCombinerInfo &DCI) const {
14835 SelectionDAG &DAG = DCI.DAG;
14836 SDLoc SL(N);
14837
14838 SDValue LHS = N->getOperand(0);
14839 SDValue RHS = N->getOperand(1);
14840 EVT VT = LHS.getValueType();
14841 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14842
14843 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14844 if (!CRHS) {
14845 CRHS = dyn_cast<ConstantSDNode>(LHS);
14846 if (CRHS) {
14847 std::swap(LHS, RHS);
14848 CC = getSetCCSwappedOperands(CC);
14849 }
14850 }
14851
14852 if (CRHS) {
14853 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14854 isBoolSGPR(LHS.getOperand(0))) {
14855 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14856 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14857 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14858 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14859 if ((CRHS->isAllOnes() &&
14860 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14861 (CRHS->isZero() &&
14862 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14863 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14864 DAG.getAllOnesConstant(SL, MVT::i1));
14865 if ((CRHS->isAllOnes() &&
14866 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14867 (CRHS->isZero() &&
14868 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14869 return LHS.getOperand(0);
14870 }
14871
14872 const APInt &CRHSVal = CRHS->getAPIntValue();
14873 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14874 LHS.getOpcode() == ISD::SELECT &&
14875 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14876 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14877 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14878 isBoolSGPR(LHS.getOperand(0))) {
14879 // Given CT != FT:
14880 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14881 // setcc (select cc, CT, CF), CF, ne => cc
14882 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14883 // setcc (select cc, CT, CF), CT, eq => cc
14884 const APInt &CT = LHS.getConstantOperandAPInt(1);
14885 const APInt &CF = LHS.getConstantOperandAPInt(2);
14886
14887 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14888 (CT == CRHSVal && CC == ISD::SETNE))
14889 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14890 DAG.getAllOnesConstant(SL, MVT::i1));
14891 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14892 (CT == CRHSVal && CC == ISD::SETEQ))
14893 return LHS.getOperand(0);
14894 }
14895 }
14896
14897 if (VT != MVT::f32 && VT != MVT::f64 &&
14898 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14899 return SDValue();
14900
14901 // Match isinf/isfinite pattern
14902 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14903 // (fcmp one (fabs x), inf) -> (fp_class x,
14904 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14905 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14906 LHS.getOpcode() == ISD::FABS) {
14907 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14908 if (!CRHS)
14909 return SDValue();
14910
14911 const APFloat &APF = CRHS->getValueAPF();
14912 if (APF.isInfinity() && !APF.isNegative()) {
14913 const unsigned IsInfMask =
14914 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14915 const unsigned IsFiniteMask =
14916 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14917 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14918 SIInstrFlags::P_SUBNORMAL;
14919 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14920 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14921 DAG.getConstant(Mask, SL, MVT::i32));
14922 }
14923 }
14924
14925 return SDValue();
14926}
14927
14928SDValue
14929SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14930 DAGCombinerInfo &DCI) const {
14931 SelectionDAG &DAG = DCI.DAG;
14932 SDLoc SL(N);
14933 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14934
14935 SDValue Src = N->getOperand(0);
14936 SDValue Shift = N->getOperand(0);
14937
14938 // TODO: Extend type shouldn't matter (assuming legal types).
14939 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14940 Shift = Shift.getOperand(0);
14941
14942 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14943 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14944 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14945 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14946 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14947 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14948 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14949 SDValue Shifted = DAG.getZExtOrTrunc(
14950 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14951
14952 unsigned ShiftOffset = 8 * Offset;
14953 if (Shift.getOpcode() == ISD::SHL)
14954 ShiftOffset -= C->getZExtValue();
14955 else
14956 ShiftOffset += C->getZExtValue();
14957
14958 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14959 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14960 MVT::f32, Shifted);
14961 }
14962 }
14963 }
14964
14965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14966 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14967 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14968 // We simplified Src. If this node is not dead, visit it again so it is
14969 // folded properly.
14970 if (N->getOpcode() != ISD::DELETED_NODE)
14971 DCI.AddToWorklist(N);
14972 return SDValue(N, 0);
14973 }
14974
14975 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14976 if (SDValue DemandedSrc =
14977 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14978 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14979
14980 return SDValue();
14981}
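// Illustrative note (not part of the upstream source): for
//   (cvt_f32_ubyte1 (srl x, 8))
// the combine above computes ShiftOffset = 8 * 1 + 8 = 16, which is < 32 and
// a multiple of 8, so the node is rebuilt as (cvt_f32_ubyte2 x) and the
// shift disappears.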
14982
14983SDValue SITargetLowering::performClampCombine(SDNode *N,
14984 DAGCombinerInfo &DCI) const {
14985 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14986 if (!CSrc)
14987 return SDValue();
14988
14989 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14990 const APFloat &F = CSrc->getValueAPF();
14991 APFloat Zero = APFloat::getZero(F.getSemantics());
14992 if (F < Zero ||
14993 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14994 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14995 }
14996
14997 APFloat One(F.getSemantics(), "1.0");
14998 if (F > One)
14999 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15000
15001 return SDValue(CSrc, 0);
15002}
15003
15004 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15005 DAGCombinerInfo &DCI) const {
15006 switch (N->getOpcode()) {
15007 case ISD::ADD:
15008 case ISD::SUB:
15009 case ISD::SHL:
15010 case ISD::SRL:
15011 case ISD::SRA:
15012 case ISD::AND:
15013 case ISD::OR:
15014 case ISD::XOR:
15015 case ISD::MUL:
15016 case ISD::SETCC:
15017 case ISD::SELECT:
15018 case ISD::SMIN:
15019 case ISD::SMAX:
15020 case ISD::UMIN:
15021 case ISD::UMAX:
15022 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15023 return Res;
15024 break;
15025 default:
15026 break;
15027 }
15028
15029 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15030 return SDValue();
15031
15032 switch (N->getOpcode()) {
15033 case ISD::ADD:
15034 return performAddCombine(N, DCI);
15035 case ISD::SUB:
15036 return performSubCombine(N, DCI);
15037 case ISD::UADDO_CARRY:
15038 case ISD::USUBO_CARRY:
15039 return performAddCarrySubCarryCombine(N, DCI);
15040 case ISD::FADD:
15041 return performFAddCombine(N, DCI);
15042 case ISD::FSUB:
15043 return performFSubCombine(N, DCI);
15044 case ISD::FDIV:
15045 return performFDivCombine(N, DCI);
15046 case ISD::FMUL:
15047 return performFMulCombine(N, DCI);
15048 case ISD::SETCC:
15049 return performSetCCCombine(N, DCI);
15050 case ISD::FMAXNUM:
15051 case ISD::FMINNUM:
15052 case ISD::FMAXNUM_IEEE:
15053 case ISD::FMINNUM_IEEE:
15054 case ISD::FMAXIMUM:
15055 case ISD::FMINIMUM:
15056 case ISD::SMAX:
15057 case ISD::SMIN:
15058 case ISD::UMAX:
15059 case ISD::UMIN:
15060 case AMDGPUISD::FMIN_LEGACY:
15061 case AMDGPUISD::FMAX_LEGACY:
15062 return performMinMaxCombine(N, DCI);
15063 case ISD::FMA:
15064 return performFMACombine(N, DCI);
15065 case ISD::AND:
15066 return performAndCombine(N, DCI);
15067 case ISD::OR:
15068 return performOrCombine(N, DCI);
15069 case ISD::FSHR: {
15070 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15071 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15072 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15073 return matchPERM(N, DCI);
15074 }
15075 break;
15076 }
15077 case ISD::XOR:
15078 return performXorCombine(N, DCI);
15079 case ISD::ZERO_EXTEND:
15080 return performZeroExtendCombine(N, DCI);
15081 case ISD::SIGN_EXTEND_INREG:
15082 return performSignExtendInRegCombine(N, DCI);
15083 case AMDGPUISD::FP_CLASS:
15084 return performClassCombine(N, DCI);
15085 case ISD::FCANONICALIZE:
15086 return performFCanonicalizeCombine(N, DCI);
15087 case AMDGPUISD::RCP:
15088 return performRcpCombine(N, DCI);
15089 case ISD::FLDEXP:
15090 case AMDGPUISD::FRACT:
15091 case AMDGPUISD::RSQ:
15092 case AMDGPUISD::RCP_LEGACY:
15093 case AMDGPUISD::RCP_IFLAG:
15094 case AMDGPUISD::RSQ_CLAMP: {
15095 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15096 SDValue Src = N->getOperand(0);
15097 if (Src.isUndef())
15098 return Src;
15099 break;
15100 }
15101 case ISD::SINT_TO_FP:
15102 case ISD::UINT_TO_FP:
15103 return performUCharToFloatCombine(N, DCI);
15104 case ISD::FCOPYSIGN:
15105 return performFCopySignCombine(N, DCI);
15106 case AMDGPUISD::CVT_F32_UBYTE0:
15107 case AMDGPUISD::CVT_F32_UBYTE1:
15108 case AMDGPUISD::CVT_F32_UBYTE2:
15109 case AMDGPUISD::CVT_F32_UBYTE3:
15110 return performCvtF32UByteNCombine(N, DCI);
15111 case AMDGPUISD::FMED3:
15112 return performFMed3Combine(N, DCI);
15113 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15114 return performCvtPkRTZCombine(N, DCI);
15115 case AMDGPUISD::CLAMP:
15116 return performClampCombine(N, DCI);
15117 case ISD::SCALAR_TO_VECTOR: {
15118 SelectionDAG &DAG = DCI.DAG;
15119 EVT VT = N->getValueType(0);
15120
15121 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15122 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15123 SDLoc SL(N);
15124 SDValue Src = N->getOperand(0);
15125 EVT EltVT = Src.getValueType();
15126 if (EltVT != MVT::i16)
15127 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15128
15129 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15130 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15131 }
15132
15133 break;
15134 }
15135 case ISD::EXTRACT_VECTOR_ELT:
15136 return performExtractVectorEltCombine(N, DCI);
15137 case ISD::INSERT_VECTOR_ELT:
15138 return performInsertVectorEltCombine(N, DCI);
15139 case ISD::FP_ROUND:
15140 return performFPRoundCombine(N, DCI);
15141 case ISD::LOAD: {
15142 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15143 return Widened;
15144 [[fallthrough]];
15145 }
15146 default: {
15147 if (!DCI.isBeforeLegalize()) {
15148 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15149 return performMemSDNodeCombine(MemNode, DCI);
15150 }
15151
15152 break;
15153 }
15154 }
15155
15156 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15157}
15158
15159/// Helper function for adjustWritemask
15160static unsigned SubIdx2Lane(unsigned Idx) {
15161 switch (Idx) {
15162 default:
15163 return ~0u;
15164 case AMDGPU::sub0:
15165 return 0;
15166 case AMDGPU::sub1:
15167 return 1;
15168 case AMDGPU::sub2:
15169 return 2;
15170 case AMDGPU::sub3:
15171 return 3;
15172 case AMDGPU::sub4:
15173 return 4; // Possible with TFE/LWE
15174 }
15175}
15176
15177/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15178SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15179 SelectionDAG &DAG) const {
15180 unsigned Opcode = Node->getMachineOpcode();
15181
15182 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15183 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15184 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15185 return Node; // not implemented for D16
15186
15187 SDNode *Users[5] = {nullptr};
15188 unsigned Lane = 0;
15189 unsigned DmaskIdx =
15190 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15191 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15192 unsigned NewDmask = 0;
15193 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15194 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15195 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15196 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15197 ? true
15198 : false;
15199 unsigned TFCLane = 0;
15200 bool HasChain = Node->getNumValues() > 1;
15201
15202 if (OldDmask == 0) {
15203 // These are folded out, but on the chance it happens don't assert.
15204 return Node;
15205 }
15206
15207 unsigned OldBitsSet = llvm::popcount(OldDmask);
15208 // Work out which is the TFE/LWE lane if that is enabled.
15209 if (UsesTFC) {
15210 TFCLane = OldBitsSet;
15211 }
15212
15213 // Try to figure out the used register components
15214 for (SDUse &Use : Node->uses()) {
15215
15216 // Don't look at users of the chain.
15217 if (Use.getResNo() != 0)
15218 continue;
15219
15220 SDNode *User = Use.getUser();
15221
15222 // Abort if we can't understand the usage
15223 if (!User->isMachineOpcode() ||
15224 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15225 return Node;
15226
15227 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15228 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15229 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15230 // set, etc.
15231 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15232 if (Lane == ~0u)
15233 return Node;
15234
15235 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15236 if (UsesTFC && Lane == TFCLane) {
15237 Users[Lane] = User;
15238 } else {
15239 // Set which texture component corresponds to the lane.
15240 unsigned Comp;
15241 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15242 Comp = llvm::countr_zero(Dmask);
15243 Dmask &= ~(1 << Comp);
15244 }
15245
15246 // Abort if we have more than one user per component.
15247 if (Users[Lane])
15248 return Node;
15249
15250 Users[Lane] = User;
15251 NewDmask |= 1 << Comp;
15252 }
15253 }
15254
15255 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15256 bool NoChannels = !NewDmask;
15257 if (NoChannels) {
15258 if (!UsesTFC) {
15259 // No uses of the result and not using TFC. Then do nothing.
15260 return Node;
15261 }
15262 // If the original dmask has one channel - then nothing to do
15263 if (OldBitsSet == 1)
15264 return Node;
15265 // Use an arbitrary dmask - required for the instruction to work
15266 NewDmask = 1;
15267 }
15268 // Abort if there's no change
15269 if (NewDmask == OldDmask)
15270 return Node;
15271
15272 unsigned BitsSet = llvm::popcount(NewDmask);
15273
15274 // Check for TFE or LWE - increase the number of channels by one to account
15275 // for the extra return value
15276 // This will need adjustment for D16 if it is also handled in
15277 // adjustWritemask (this function), but at present D16 is excluded.
15278 unsigned NewChannels = BitsSet + UsesTFC;
15279
15280 int NewOpcode =
15281 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15282 assert(NewOpcode != -1 &&
15283 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15284 "failed to find equivalent MIMG op");
15285
15286 // Adjust the writemask in the node
15287 SmallVector<SDValue, 12> Ops;
15288 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15289 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15290 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15291
15292 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15293
15294 MVT ResultVT = NewChannels == 1
15295 ? SVT
15296 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15297 : NewChannels == 5 ? 8
15298 : NewChannels);
15299 SDVTList NewVTList =
15300 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15301
15302 MachineSDNode *NewNode =
15303 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15304
15305 if (HasChain) {
15306 // Update chain.
15307 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15308 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15309 }
15310
15311 if (NewChannels == 1) {
15312 assert(Node->hasNUsesOfValue(1, 0));
15313 SDNode *Copy =
15314 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15315 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15316 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15317 return nullptr;
15318 }
15319
15320 // Update the users of the node with the new indices
15321 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15322 SDNode *User = Users[i];
15323 if (!User) {
15324 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15325 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15326 if (i || !NoChannels)
15327 continue;
15328 } else {
15329 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15330 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15331 if (NewUser != User) {
15332 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15333 DAG.RemoveDeadNode(User);
15334 }
15335 }
15336
15337 switch (Idx) {
15338 default:
15339 break;
15340 case AMDGPU::sub0:
15341 Idx = AMDGPU::sub1;
15342 break;
15343 case AMDGPU::sub1:
15344 Idx = AMDGPU::sub2;
15345 break;
15346 case AMDGPU::sub2:
15347 Idx = AMDGPU::sub3;
15348 break;
15349 case AMDGPU::sub3:
15350 Idx = AMDGPU::sub4;
15351 break;
15352 }
15353 }
15354
15355 DAG.RemoveDeadNode(Node);
15356 return nullptr;
15357}
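// Illustrative note (not part of the upstream source): for an image sample
// with dmask = 0xf whose result is only read through EXTRACT_SUBREG of sub0
// and sub2, the walk above records users for lanes 0 and 2, computes
// NewDmask = 0b0101, and reselects the two-channel variant; the surviving
// extracts are then rewritten to sub0 and sub1 of the narrower result.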
15358
15359 static bool isFrameIndexOp(SDValue Op) {
15360 if (Op.getOpcode() == ISD::AssertZext)
15361 Op = Op.getOperand(0);
15362
15363 return isa<FrameIndexSDNode>(Op);
15364}
15365
15366/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15367/// with frame index operands.
15368 /// LLVM assumes that inputs to these instructions are registers.
15369SDNode *
15370 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15371 SelectionDAG &DAG) const {
15372 if (Node->getOpcode() == ISD::CopyToReg) {
15373 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15374 SDValue SrcVal = Node->getOperand(2);
15375
15376 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15377 // to try understanding copies to physical registers.
15378 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15379 SDLoc SL(Node);
15380 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15381 SDValue VReg = DAG.getRegister(
15382 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15383
15384 SDNode *Glued = Node->getGluedNode();
15385 SDValue ToVReg = DAG.getCopyToReg(
15386 Node->getOperand(0), SL, VReg, SrcVal,
15387 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15388 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15389 VReg, ToVReg.getValue(1));
15390 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15391 DAG.RemoveDeadNode(Node);
15392 return ToResultReg.getNode();
15393 }
15394 }
15395
15396 SmallVector<SDValue, 8> Ops;
15397 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15398 if (!isFrameIndexOp(Node->getOperand(i))) {
15399 Ops.push_back(Node->getOperand(i));
15400 continue;
15401 }
15402
15403 SDLoc DL(Node);
15404 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15405 Node->getOperand(i).getValueType(),
15406 Node->getOperand(i)),
15407 0));
15408 }
15409
15410 return DAG.UpdateNodeOperands(Node, Ops);
15411}
15412
15413/// Fold the instructions after selecting them.
15414/// Returns null if users were already updated.
15415 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15416 SelectionDAG &DAG) const {
15417 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15418 unsigned Opcode = Node->getMachineOpcode();
15419
15420 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15421 !TII->isGather4(Opcode) &&
15422 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15423 return adjustWritemask(Node, DAG);
15424 }
15425
15426 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15427 legalizeTargetIndependentNode(Node, DAG);
15428 return Node;
15429 }
15430
15431 switch (Opcode) {
15432 case AMDGPU::V_DIV_SCALE_F32_e64:
15433 case AMDGPU::V_DIV_SCALE_F64_e64: {
15434 // Satisfy the operand register constraint when one of the inputs is
15435 // undefined. Ordinarily each undef value will have its own implicit_def of
15436 // a vreg, so force these to use a single register.
15437 SDValue Src0 = Node->getOperand(1);
15438 SDValue Src1 = Node->getOperand(3);
15439 SDValue Src2 = Node->getOperand(5);
15440
15441 if ((Src0.isMachineOpcode() &&
15442 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15443 (Src0 == Src1 || Src0 == Src2))
15444 break;
15445
15446 MVT VT = Src0.getValueType().getSimpleVT();
15447 const TargetRegisterClass *RC =
15448 getRegClassFor(VT, Src0.getNode()->isDivergent());
15449
15450 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15451 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15452
15453 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15454 Src0, SDValue());
15455
15456 // src0 must be the same register as src1 or src2, even if the value is
15457 // undefined, so make sure we don't violate this constraint.
15458 if (Src0.isMachineOpcode() &&
15459 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15460 if (Src1.isMachineOpcode() &&
15461 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15462 Src0 = Src1;
15463 else if (Src2.isMachineOpcode() &&
15464 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15465 Src0 = Src2;
15466 else {
15467 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15468 Src0 = UndefReg;
15469 Src1 = UndefReg;
15470 }
15471 } else
15472 break;
15473
15474 SmallVector<SDValue, 9> Ops(Node->ops());
15475 Ops[1] = Src0;
15476 Ops[3] = Src1;
15477 Ops[5] = Src2;
15478 Ops.push_back(ImpDef.getValue(1));
15479 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15480 }
15481 default:
15482 break;
15483 }
15484
15485 return Node;
15486}
15487
15488// Any MIMG instructions that use tfe or lwe require an initialization of the
15489// result register that will be written in the case of a memory access failure.
15490// The required code is also added to tie this init code to the result of the
15491// img instruction.
15492 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15493 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15494 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15495 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15496 MachineBasicBlock &MBB = *MI.getParent();
15497
15498 int DstIdx =
15499 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15500 unsigned InitIdx = 0;
15501
15502 if (TII->isImage(MI)) {
15503 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15504 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15505 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15506
15507 if (!TFE && !LWE) // intersect_ray
15508 return;
15509
15510 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15511 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15512 unsigned D16Val = D16 ? D16->getImm() : 0;
15513
15514 if (!TFEVal && !LWEVal)
15515 return;
15516
15517 // At least one of TFE or LWE is non-zero.
15518 // We have to insert a suitable initialization of the result value and
15519 // tie this to the dest of the image instruction.
15520
15521 // Calculate which dword we have to initialize to 0.
15522 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15523
15524 // check that dmask operand is found.
15525 assert(MO_Dmask && "Expected dmask operand in instruction");
15526
15527 unsigned dmask = MO_Dmask->getImm();
15528 // Determine the number of active lanes taking into account the
15529 // Gather4 special case
15530 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15531
15532 bool Packed = !Subtarget->hasUnpackedD16VMem();
15533
15534 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15535
15536 // Abandon attempt if the dst size isn't large enough
15537 // - this is in fact an error but this is picked up elsewhere and
15538 // reported correctly.
15539 uint32_t DstSize =
15540 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15541 if (DstSize < InitIdx)
15542 return;
15543 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15544 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15545 } else {
15546 return;
15547 }
15548
15549 const DebugLoc &DL = MI.getDebugLoc();
15550
15551 // Create a register for the initialization value.
15552 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15553 unsigned NewDst = 0; // Final initialized value will be in here
15554
15555 // If PRTStrictNull feature is enabled (the default) then initialize
15556 // all the result registers to 0, otherwise just the error indication
15557 // register (VGPRn+1)
15558 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15559 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15560
15561 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15562 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15563 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15564 // Initialize dword
15565 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15566 // clang-format off
15567 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15568 .addImm(0);
15569 // clang-format on
15570 // Insert into the super-reg
15571 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15572 .addReg(PrevDst)
15573 .addReg(SubReg)
15575 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15576 PrevDst = NewDst;
15577 }
15578
15579 // Add as an implicit operand
15580 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15581
15582 // Tie the just added implicit operand to the dst
15583 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15584}
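// Illustrative note (not part of the upstream source): for an image load
// with dmask = 0b0011, tfe = 1 and d16 = 0, ActiveLanes = 2, so InitIdx = 3;
// with PRTStrictNull enabled (the default) the loop above zero-initializes
// three dwords of the tied result: the two data channels plus the TFE error
// dword.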
15585
15586/// Assign the register class depending on the number of
15587/// bits set in the writemask
15588 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15589 SDNode *Node) const {
15590 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15591
15592 MachineFunction *MF = MI.getParent()->getParent();
15593 MachineRegisterInfo &MRI = MF->getRegInfo();
15594 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15595
15596 if (TII->isVOP3(MI.getOpcode())) {
15597 // Make sure constant bus requirements are respected.
15598 TII->legalizeOperandsVOP3(MRI, MI);
15599
15600 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15601 // This saves a chain-copy of registers and better balance register
15602 // use between vgpr and agpr as agpr tuples tend to be big.
15603 if (!MI.getDesc().operands().empty()) {
15604 unsigned Opc = MI.getOpcode();
15605 bool HasAGPRs = Info->mayNeedAGPRs();
15606 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15607 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15608 for (auto I :
15609 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15610 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15611 if (I == -1)
15612 break;
15613 if ((I == Src2Idx) && (HasAGPRs))
15614 break;
15615 MachineOperand &Op = MI.getOperand(I);
15616 if (!Op.isReg() || !Op.getReg().isVirtual())
15617 continue;
15618 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15619 if (!TRI->hasAGPRs(RC))
15620 continue;
15621 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15622 if (!Src || !Src->isCopy() ||
15623 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15624 continue;
15625 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15626 // All uses of agpr64 and agpr32 can also accept vgpr except for
15627 // v_accvgpr_read, but we do not produce agpr reads during selection,
15628 // so no use checks are needed.
15629 MRI.setRegClass(Op.getReg(), NewRC);
15630 }
15631
15632 if (TII->isMAI(MI)) {
15633 // The ordinary src0, src1, src2 were legalized above.
15634 //
15635 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15636 // as a separate instruction.
15637 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15638 AMDGPU::OpName::scale_src0);
15639 if (Src0Idx != -1) {
15640 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15641 AMDGPU::OpName::scale_src1);
15642 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15643 TII->usesConstantBus(MRI, MI, Src1Idx))
15644 TII->legalizeOpWithMove(MI, Src1Idx);
15645 }
15646 }
15647
15648 if (!HasAGPRs)
15649 return;
15650
15651 // Resolve the rest of AV operands to AGPRs.
15652 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15653 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15654 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15655 if (TRI->isVectorSuperClass(RC)) {
15656 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15657 MRI.setRegClass(Src2->getReg(), NewRC);
15658 if (Src2->isTied())
15659 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15660 }
15661 }
15662 }
15663 }
15664
15665 return;
15666 }
15667
15668 if (TII->isImage(MI))
15669 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15670}
15671
15672 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15673 uint64_t Val) {
15674 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15675 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15676}
15677
15678 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15679 const SDLoc &DL,
15680 SDValue Ptr) const {
15681 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15682
15683 // Build the half of the subregister with the constants before building the
15684 // full 128-bit register. If we are building multiple resource descriptors,
15685 // this will allow CSEing of the 2-component register.
15686 const SDValue Ops0[] = {
15687 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15688 buildSMovImm32(DAG, DL, 0),
15689 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15690 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15691 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15692
15693 SDValue SubRegHi = SDValue(
15694 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15695
15696 // Combine the constants and the pointer.
15697 const SDValue Ops1[] = {
15698 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15699 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15700 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15701
15702 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15703}
15704
15705/// Return a resource descriptor with the 'Add TID' bit enabled
15706/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15707/// of the resource descriptor) to create an offset, which is added to
15708/// the resource pointer.
15709 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15710 SDValue Ptr, uint32_t RsrcDword1,
15711 uint64_t RsrcDword2And3) const {
15712 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15713 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15714 if (RsrcDword1) {
15715 PtrHi =
15716 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15717 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15718 0);
15719 }
15720
15721 SDValue DataLo =
15722 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15723 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15724
15725 const SDValue Ops[] = {
15726 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15727 PtrLo,
15728 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15729 PtrHi,
15730 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15731 DataLo,
15732 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15733 DataHi,
15734 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15735
15736 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15737}
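// Illustrative note (not part of the upstream source): the REG_SEQUENCE
// built above lays out the four descriptor dwords as
//   sub0 = pointer bits [31:0]
//   sub1 = pointer bits [63:32], OR'd with RsrcDword1 (which carries the
//          stride/"add TID" fields)
//   sub2 = RsrcDword2And3 bits [31:0]
//   sub3 = RsrcDword2And3 bits [63:32]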
15738
15739//===----------------------------------------------------------------------===//
15740// SI Inline Assembly Support
15741//===----------------------------------------------------------------------===//
15742
15743std::pair<unsigned, const TargetRegisterClass *>
15744 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15745 StringRef Constraint,
15746 MVT VT) const {
15747 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15748
15749 const TargetRegisterClass *RC = nullptr;
15750 if (Constraint.size() == 1) {
15751 const unsigned BitWidth = VT.getSizeInBits();
15752 switch (Constraint[0]) {
15753 default:
15754 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15755 case 's':
15756 case 'r':
15757 switch (BitWidth) {
15758 case 16:
15759 RC = &AMDGPU::SReg_32RegClass;
15760 break;
15761 case 64:
15762 RC = &AMDGPU::SGPR_64RegClass;
15763 break;
15764 default:
15765 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15766 if (!RC)
15767 return std::pair(0U, nullptr);
15768 break;
15769 }
15770 break;
15771 case 'v':
15772 switch (BitWidth) {
15773 case 16:
15774 RC = &AMDGPU::VGPR_32RegClass;
15775 break;
15776 default:
15777 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15778 if (!RC)
15779 return std::pair(0U, nullptr);
15780 break;
15781 }
15782 break;
15783 case 'a':
15784 if (!Subtarget->hasMAIInsts())
15785 break;
15786 switch (BitWidth) {
15787 case 16:
15788 RC = &AMDGPU::AGPR_32RegClass;
15789 break;
15790 default:
15791 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15792 if (!RC)
15793 return std::pair(0U, nullptr);
15794 break;
15795 }
15796 break;
15797 }
15798 // We actually support i128, i16 and f16 as inline parameters
15799 // even if they are not reported as legal
15800 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15801 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15802 return std::pair(0U, RC);
15803 }
15804
15805 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15806 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15807 if (RegName.consume_front("v")) {
15808 RC = &AMDGPU::VGPR_32RegClass;
15809 } else if (RegName.consume_front("s")) {
15810 RC = &AMDGPU::SGPR_32RegClass;
15811 } else if (RegName.consume_front("a")) {
15812 RC = &AMDGPU::AGPR_32RegClass;
15813 }
15814
15815 if (RC) {
15816 uint32_t Idx;
15817 if (RegName.consume_front("[")) {
15818 uint32_t End;
15819 bool Failed = RegName.consumeInteger(10, Idx);
15820 Failed |= !RegName.consume_front(":");
15821 Failed |= RegName.consumeInteger(10, End);
15822 Failed |= !RegName.consume_back("]");
15823 if (!Failed) {
15824 uint32_t Width = (End - Idx + 1) * 32;
15825 // Prohibit constraints for register ranges with a width that does not
15826 // match the required type.
15827 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15828 return std::pair(0U, nullptr);
15829 MCRegister Reg = RC->getRegister(Idx);
15830 if (SIRegisterInfo::isVGPRClass(RC))
15831 RC = TRI->getVGPRClassForBitWidth(Width);
15832 else if (SIRegisterInfo::isSGPRClass(RC))
15833 RC = TRI->getSGPRClassForBitWidth(Width);
15834 else if (SIRegisterInfo::isAGPRClass(RC))
15835 RC = TRI->getAGPRClassForBitWidth(Width);
15836 if (RC) {
15837 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15838 return std::pair(Reg, RC);
15839 }
15840 }
15841 } else {
15842 // Check for lossy scalar/vector conversions.
15843 if (VT.isVector() && VT.getSizeInBits() != 32)
15844 return std::pair(0U, nullptr);
15845 bool Failed = RegName.getAsInteger(10, Idx);
15846 if (!Failed && Idx < RC->getNumRegs())
15847 return std::pair(RC->getRegister(Idx), RC);
15848 }
15849 }
15850 }
15851
15852 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15853 if (Ret.first)
15854 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15855
15856 return Ret;
15857}
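// Illustrative note (not part of the upstream source): a constraint such as
// "{v[8:9]}" with an i64 operand is parsed above as Idx = 8 and End = 9,
// giving Width = (9 - 8 + 1) * 32 = 64 bits; the width matches the type, so
// the VGPR_32 register at index 8 is widened to its 64-bit super-register
// covering v8..v9 and returned with the VReg_64 class.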
15858
15859static bool isImmConstraint(StringRef Constraint) {
15860 if (Constraint.size() == 1) {
15861 switch (Constraint[0]) {
15862 default:
15863 break;
15864 case 'I':
15865 case 'J':
15866 case 'A':
15867 case 'B':
15868 case 'C':
15869 return true;
15870 }
15871 } else if (Constraint == "DA" || Constraint == "DB") {
15872 return true;
15873 }
15874 return false;
15875}
15876
15877 SITargetLowering::ConstraintType
15878 SITargetLowering::getConstraintType(StringRef Constraint) const {
15879 if (Constraint.size() == 1) {
15880 switch (Constraint[0]) {
15881 default:
15882 break;
15883 case 's':
15884 case 'v':
15885 case 'a':
15886 return C_RegisterClass;
15887 }
15888 }
15889 if (isImmConstraint(Constraint)) {
15890 return C_Other;
15891 }
15892 return TargetLowering::getConstraintType(Constraint);
15893}
15894
15895static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15897 Val = Val & maskTrailingOnes<uint64_t>(Size);
15898 }
15899 return Val;
15900}
15901
15902 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15903 StringRef Constraint,
15904 std::vector<SDValue> &Ops,
15905 SelectionDAG &DAG) const {
15906 if (isImmConstraint(Constraint)) {
15907 uint64_t Val;
15908 if (getAsmOperandConstVal(Op, Val) &&
15909 checkAsmConstraintVal(Op, Constraint, Val)) {
15910 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15911 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15912 }
15913 } else {
15914 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15915 }
15916}
15917
15918 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15919 unsigned Size = Op.getScalarValueSizeInBits();
15920 if (Size > 64)
15921 return false;
15922
15923 if (Size == 16 && !Subtarget->has16BitInsts())
15924 return false;
15925
15926 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15927 Val = C->getSExtValue();
15928 return true;
15929 }
15930 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15931 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15932 return true;
15933 }
15934 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15935 if (Size != 16 || Op.getNumOperands() != 2)
15936 return false;
15937 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15938 return false;
15939 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15940 Val = C->getSExtValue();
15941 return true;
15942 }
15943 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15944 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15945 return true;
15946 }
15947 }
15948
15949 return false;
15950}
15951
15952 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15953 uint64_t Val) const {
15954 if (Constraint.size() == 1) {
15955 switch (Constraint[0]) {
15956 case 'I':
15957 return AMDGPU::isInlinableIntLiteral(Val);
15958 case 'J':
15959 return isInt<16>(Val);
15960 case 'A':
15961 return checkAsmConstraintValA(Op, Val);
15962 case 'B':
15963 return isInt<32>(Val);
15964 case 'C':
15965 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15966 AMDGPU::isInlinableIntLiteral(Val);
15967 default:
15968 break;
15969 }
15970 } else if (Constraint.size() == 2) {
15971 if (Constraint == "DA") {
15972 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15973 int64_t LoBits = static_cast<int32_t>(Val);
15974 return checkAsmConstraintValA(Op, HiBits, 32) &&
15975 checkAsmConstraintValA(Op, LoBits, 32);
15976 }
15977 if (Constraint == "DB") {
15978 return true;
15979 }
15980 }
15981 llvm_unreachable("Invalid asm constraint");
15982}
15983
15984 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15985 unsigned MaxSize) const {
15986 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15987 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15988 if (Size == 16) {
15989 MVT VT = Op.getSimpleValueType();
15990 switch (VT.SimpleTy) {
15991 default:
15992 return false;
15993 case MVT::i16:
15994 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15995 case MVT::f16:
15996 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15997 case MVT::bf16:
15998 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15999 case MVT::v2i16:
16000 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16001 case MVT::v2f16:
16002 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16003 case MVT::v2bf16:
16004 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16005 }
16006 }
16007 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16008 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16009 return true;
16010 return false;
16011}
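// Illustrative note (not part of the upstream source): for a 32-bit f32
// operand the 'A' constraint accepts 0x3F800000 (1.0f), which is a hardware
// inline constant, but rejects an arbitrary literal such as 0x40490FDB
// (an approximation of pi), which is not inlinable.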
16012
16013static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16014 switch (UnalignedClassID) {
16015 case AMDGPU::VReg_64RegClassID:
16016 return AMDGPU::VReg_64_Align2RegClassID;
16017 case AMDGPU::VReg_96RegClassID:
16018 return AMDGPU::VReg_96_Align2RegClassID;
16019 case AMDGPU::VReg_128RegClassID:
16020 return AMDGPU::VReg_128_Align2RegClassID;
16021 case AMDGPU::VReg_160RegClassID:
16022 return AMDGPU::VReg_160_Align2RegClassID;
16023 case AMDGPU::VReg_192RegClassID:
16024 return AMDGPU::VReg_192_Align2RegClassID;
16025 case AMDGPU::VReg_224RegClassID:
16026 return AMDGPU::VReg_224_Align2RegClassID;
16027 case AMDGPU::VReg_256RegClassID:
16028 return AMDGPU::VReg_256_Align2RegClassID;
16029 case AMDGPU::VReg_288RegClassID:
16030 return AMDGPU::VReg_288_Align2RegClassID;
16031 case AMDGPU::VReg_320RegClassID:
16032 return AMDGPU::VReg_320_Align2RegClassID;
16033 case AMDGPU::VReg_352RegClassID:
16034 return AMDGPU::VReg_352_Align2RegClassID;
16035 case AMDGPU::VReg_384RegClassID:
16036 return AMDGPU::VReg_384_Align2RegClassID;
16037 case AMDGPU::VReg_512RegClassID:
16038 return AMDGPU::VReg_512_Align2RegClassID;
16039 case AMDGPU::VReg_1024RegClassID:
16040 return AMDGPU::VReg_1024_Align2RegClassID;
16041 case AMDGPU::AReg_64RegClassID:
16042 return AMDGPU::AReg_64_Align2RegClassID;
16043 case AMDGPU::AReg_96RegClassID:
16044 return AMDGPU::AReg_96_Align2RegClassID;
16045 case AMDGPU::AReg_128RegClassID:
16046 return AMDGPU::AReg_128_Align2RegClassID;
16047 case AMDGPU::AReg_160RegClassID:
16048 return AMDGPU::AReg_160_Align2RegClassID;
16049 case AMDGPU::AReg_192RegClassID:
16050 return AMDGPU::AReg_192_Align2RegClassID;
16051 case AMDGPU::AReg_256RegClassID:
16052 return AMDGPU::AReg_256_Align2RegClassID;
16053 case AMDGPU::AReg_512RegClassID:
16054 return AMDGPU::AReg_512_Align2RegClassID;
16055 case AMDGPU::AReg_1024RegClassID:
16056 return AMDGPU::AReg_1024_Align2RegClassID;
16057 default:
16058 return -1;
16059 }
16060}
16061
16062// Figure out which registers should be reserved for stack access. Only after
16063// the function is legalized do we know all of the non-spill stack objects or if
16064// calls are present.
16065 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16066 MachineRegisterInfo &MRI = MF.getRegInfo();
16067 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16068 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16069 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16070 const SIInstrInfo *TII = ST.getInstrInfo();
16071
16072 if (Info->isEntryFunction()) {
16073 // Callable functions have fixed registers used for stack access.
16074 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16075 }
16076
16077 // TODO: Move this logic to getReservedRegs()
16078 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16079 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16080 Register SReg = ST.isWave32()
16081 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16082 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16083 &AMDGPU::SGPR_64RegClass);
16084 Info->setSGPRForEXECCopy(SReg);
16085
16086 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16087 Info->getStackPtrOffsetReg()));
16088 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16089 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16090
16091 // We need to worry about replacing the default register with itself in case
16092 // of MIR testcases missing the MFI.
16093 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16094 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16095
16096 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16097 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16098
16099 Info->limitOccupancy(MF);
16100
16101 if (ST.isWave32() && !MF.empty()) {
16102 for (auto &MBB : MF) {
16103 for (auto &MI : MBB) {
16104 TII->fixImplicitOperands(MI);
16105 }
16106 }
16107 }
16108
16109 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16110 // classes if required. Ideally the register class constraints would differ
16111 // per-subtarget, but there's no easy way to achieve that right now. This is
16112 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16113 // from using them as the register class for legal types.
16114 if (ST.needsAlignedVGPRs()) {
16115 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16116 const Register Reg = Register::index2VirtReg(I);
16117 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16118 if (!RC)
16119 continue;
16120 int NewClassID = getAlignedAGPRClassID(RC->getID());
16121 if (NewClassID != -1)
16122 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16123 }
16124 }
16125
16126 TargetLoweringBase::finalizeLowering(MF);
16127}
16128
16129 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16130 KnownBits &Known,
16131 const APInt &DemandedElts,
16132 const SelectionDAG &DAG,
16133 unsigned Depth) const {
16134 Known.resetAll();
16135 unsigned Opc = Op.getOpcode();
16136 switch (Opc) {
16137 case ISD::INTRINSIC_WO_CHAIN: {
16138 unsigned IID = Op.getConstantOperandVal(0);
16139 switch (IID) {
16140 case Intrinsic::amdgcn_mbcnt_lo:
16141 case Intrinsic::amdgcn_mbcnt_hi: {
16142 const GCNSubtarget &ST =
16143 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16144 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16145 // most 31 + src1.
16146 Known.Zero.setBitsFrom(
16147 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16148 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16149 Known = KnownBits::add(Known, Known2);
16150 return;
16151 }
16152 }
16153 break;
16154 }
16155 }
16156 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16157 Op, Known, DemandedElts, DAG, Depth);
16158}
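// Illustrative note (not part of the upstream source): for
// llvm.amdgcn.mbcnt.lo on a wave32 subtarget, getWavefrontSizeLog2() == 5,
// so bits [31:5] of the lane-count contribution are known to be zero before
// the known bits of the src1 addend are folded in via KnownBits::add.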
16159
16160 void SITargetLowering::computeKnownBitsForFrameIndex(
16161 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16162 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16163
16164 // Set the high bits to zero based on the maximum allowed scratch size per
16165 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16166 // calculation won't overflow, so assume the sign bit is never set.
16167 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16168}
16169
16170 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16171 KnownBits &Known, unsigned Dim) {
16172 unsigned MaxValue =
16173 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16174 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16175}
16176
16177 void SITargetLowering::computeKnownBitsForTargetInstr(
16178 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16179 const MachineRegisterInfo &MRI, unsigned Depth) const {
16180 const MachineInstr *MI = MRI.getVRegDef(R);
16181 switch (MI->getOpcode()) {
16182 case AMDGPU::G_INTRINSIC:
16183 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16184 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16185 switch (IID) {
16186 case Intrinsic::amdgcn_workitem_id_x:
16187 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16188 break;
16189 case Intrinsic::amdgcn_workitem_id_y:
16190 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16191 break;
16192 case Intrinsic::amdgcn_workitem_id_z:
16193 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16194 break;
16195 case Intrinsic::amdgcn_mbcnt_lo:
16196 case Intrinsic::amdgcn_mbcnt_hi: {
16197 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16198 // most 31 + src1.
16199 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16200 ? getSubtarget()->getWavefrontSizeLog2()
16201 : 5);
16202 KnownBits Known2;
16203 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16204 Depth + 1);
16205 Known = KnownBits::add(Known, Known2);
16206 break;
16207 }
16208 case Intrinsic::amdgcn_groupstaticsize: {
16209 // We can report everything over the maximum size as 0. We can't report
16210 // based on the actual size because we don't know if it's accurate or not
16211 // at any given point.
16212 Known.Zero.setHighBits(
16213 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16214 break;
16215 }
16216 }
16217 break;
16218 }
16219 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16220 Known.Zero.setHighBits(24);
16221 break;
16222 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16223 Known.Zero.setHighBits(16);
16224 break;
16225 case AMDGPU::G_AMDGPU_SMED3:
16226 case AMDGPU::G_AMDGPU_UMED3: {
16227 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16228
16229 KnownBits Known2;
16230 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16231 if (Known2.isUnknown())
16232 break;
16233
16234 KnownBits Known1;
16235 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16236 if (Known1.isUnknown())
16237 break;
16238
16239 KnownBits Known0;
16240 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16241 if (Known0.isUnknown())
16242 break;
16243
16244 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16245 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16246 Known.One = Known0.One & Known1.One & Known2.One;
16247 break;
16248 }
16249 }
16250}
16251
16252 Align SITargetLowering::computeKnownAlignForTargetInstr(GISelKnownBits &KB,
16253 Register R, const MachineRegisterInfo &MRI, unsigned Depth) const {
16255 const MachineInstr *MI = MRI.getVRegDef(R);
16256 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16257 // FIXME: Can this move to generic code? What about the case where the call
16258 // site specifies a lower alignment?
16259 Intrinsic::ID IID = GI->getIntrinsicID();
16260 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16261 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16262 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16263 return *RetAlign;
16264 }
16265 return Align(1);
16266}
16267
16268 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16269 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16270 const Align CacheLineAlign = Align(64);
16271
16272 // Pre-GFX10 targets did not benefit from loop alignment.
16273 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16274 getSubtarget()->hasInstFwdPrefetchBug())
16275 return PrefAlign;
16276
16277 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16278 // By default the prefetcher keeps one cache line behind and reads two ahead.
16279 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16280 // behind and one ahead.
16281 // Therefore aligning loop headers pays off when the loop fits in 192 bytes.
16282 // If the loop fits in 64 bytes it always spans no more than two cache lines
16283 // and does not need any alignment.
16284 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
16285 // prefetch settings, and if it is at most 192 bytes we need two lines behind.
16286
16287 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16288 const MachineBasicBlock *Header = ML->getHeader();
16289 if (Header->getAlignment() != PrefAlign)
16290 return Header->getAlignment(); // Already processed.
16291
16292 unsigned LoopSize = 0;
16293 for (const MachineBasicBlock *MBB : ML->blocks()) {
16294 // If an inner loop block is aligned, assume on average half of the
16295 // alignment size will be added as nops.
16296 if (MBB != Header)
16297 LoopSize += MBB->getAlignment().value() / 2;
16298
16299 for (const MachineInstr &MI : *MBB) {
16300 LoopSize += TII->getInstSizeInBytes(MI);
16301 if (LoopSize > 192)
16302 return PrefAlign;
16303 }
16304 }
16305
16306 if (LoopSize <= 64)
16307 return PrefAlign;
16308
16309 if (LoopSize <= 128)
16310 return CacheLineAlign;
16311
16312 // If any of the parent loops is surrounded by prefetch instructions, do not
16313 // insert new ones for the inner loop, which would reset the parent's settings.
16314 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16315 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16316 auto I = Exit->getFirstNonDebugInstr();
16317 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16318 return CacheLineAlign;
16319 }
16320 }
16321
16322 MachineBasicBlock *Pre = ML->getLoopPreheader();
16323 MachineBasicBlock *Exit = ML->getExitBlock();
16324
16325 if (Pre && Exit) {
16326 auto PreTerm = Pre->getFirstTerminator();
16327 if (PreTerm == Pre->begin() ||
16328 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16329 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16330 .addImm(1); // prefetch 2 lines behind PC
16331
16332 auto ExitHead = Exit->getFirstNonDebugInstr();
16333 if (ExitHead == Exit->end() ||
16334 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16335 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16336 .addImm(2); // prefetch 1 line behind PC
16337 }
16338
16339 return CacheLineAlign;
16340}
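// Illustrative note (not part of the upstream source): a loop body of about
// 100 bytes falls into the "<= 128" bucket above, so its header is aligned
// to the 64-byte cache line and the prefetch mode is left alone; a loop of
// about 180 bytes additionally gets an S_INST_PREFETCH switching to two
// lines behind PC in the preheader and another restoring the default single
// line behind after the exit block.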
16341
16343static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16344 assert(N->getOpcode() == ISD::CopyFromReg);
16345 do {
16346 // Follow the chain until we find an INLINEASM node.
16347 N = N->getOperand(0).getNode();
16348 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16349 return true;
16350 } while (N->getOpcode() == ISD::CopyFromReg);
16351 return false;
16352}
16353
16354 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16355 FunctionLoweringInfo *FLI,
16356 UniformityInfo *UA) const {
16357 switch (N->getOpcode()) {
16358 case ISD::CopyFromReg: {
16359 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16360 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16361 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16362 Register Reg = R->getReg();
16363
16364 // FIXME: Why does this need to consider isLiveIn?
16365 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16366 return !TRI->isSGPRReg(MRI, Reg);
16367
16368 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16369 return UA->isDivergent(V);
16370
16371 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16372 return !TRI->isSGPRReg(MRI, Reg);
16373 }
16374 case ISD::LOAD: {
16375 const LoadSDNode *L = cast<LoadSDNode>(N);
16376 unsigned AS = L->getAddressSpace();
16377 // A flat load may access private memory.
16378 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16379 }
16380 case ISD::CALLSEQ_END:
16381 return true;
16382 case ISD::INTRINSIC_WO_CHAIN:
16383 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16384 case ISD::INTRINSIC_W_CHAIN:
16385 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16404 // Target-specific read-modify-write atomics are sources of divergence.
16405 return true;
16406 default:
16407 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16408 // Generic read-modify-write atomics are sources of divergence.
16409 return A->readMem() && A->writeMem();
16410 }
16411 return false;
16412 }
16413}
16414
16415 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16416 EVT VT) const {
16417 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16418 case MVT::f32:
16419 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16420 case MVT::f64:
16421 case MVT::f16:
16422 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16423 default:
16424 return false;
16425 }
16426}
16427
16428 bool SITargetLowering::denormalsEnabledForType(
16429 LLT Ty, const MachineFunction &MF) const {
16430 switch (Ty.getScalarSizeInBits()) {
16431 case 32:
16432 return !denormalModeIsFlushAllF32(MF);
16433 case 64:
16434 case 16:
16435 return !denormalModeIsFlushAllF64F16(MF);
16436 default:
16437 return false;
16438 }
16439}
16440
16441 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16442 const SelectionDAG &DAG,
16443 bool SNaN,
16444 unsigned Depth) const {
16445 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16446 const MachineFunction &MF = DAG.getMachineFunction();
16447 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16448
16449 if (Info->getMode().DX10Clamp)
16450 return true; // Clamped to 0.
16451 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16452 }
16453
16454 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16455 Depth);
16456}
16457
16458// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16459// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16461 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16462 return true;
16463
16464 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16465 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16466 if (DenormMode == DenormalMode::getPreserveSign())
16467 return true;
16468
16469 // TODO: Remove this.
16470 return RMW->getFunction()
16471 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16472 .getValueAsBool();
16473}
16474
16475 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16476 LLVMContext &Ctx = RMW->getContext();
16477 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16478 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16479
16480 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16481 << "Hardware instruction generated for atomic "
16482 << RMW->getOperationName(RMW->getOperation())
16483 << " operation at memory scope " << MemScope;
16484}
16485
16486static bool isV2F16OrV2BF16(Type *Ty) {
16487 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16488 Type *EltTy = VT->getElementType();
16489 return VT->getNumElements() == 2 &&
16490 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16491 }
16492
16493 return false;
16494}
16495
16496static bool isV2F16(Type *Ty) {
16497 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16498 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16499}
16500
16501static bool isV2BF16(Type *Ty) {
16502 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16503 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16504}
16505
16506/// \return true if atomicrmw integer ops work for the type.
16507static bool isAtomicRMWLegalIntTy(Type *Ty) {
16508 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16509 unsigned BW = IT->getBitWidth();
16510 return BW == 32 || BW == 64;
16511 }
16512
16513 return false;
16514}
16515
16516/// \return true if this atomicrmw xchg type can be selected.
16517static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16518 Type *Ty = RMW->getType();
16519 if (isAtomicRMWLegalIntTy(Ty))
16520 return true;
16521
16522 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16523 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16524 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16525 return BW == 32 || BW == 64;
16526 }
16527
16528 if (Ty->isFloatTy() || Ty->isDoubleTy())
16529 return true;
16530
16531 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16532 return VT->getNumElements() == 2 &&
16533 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16534 }
16535
16536 return false;
16537}
16538
16539/// \returns true if it's valid to emit a native instruction for \p RMW, based
16540/// on the properties of the target memory.
16541static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16542 const AtomicRMWInst *RMW,
16543 bool HasSystemScope) {
16544 // The remote/fine-grained access logic is different from the integer
16545 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16546 // fine-grained access does not work, even for a device local allocation.
16547 //
16548 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16549 // allocations work.
16550 if (HasSystemScope) {
16552 RMW->hasMetadata("amdgpu.no.remote.memory"))
16553 return true;
16555 return true;
16556
16557 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16558}
16559
16560/// \return Action to perform on AtomicRMWInsts for integer operations.
16561 static TargetLowering::AtomicExpansionKind
16562 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16563 return isAtomicRMWLegalIntTy(RMW->getType())
16564 ? TargetLowering::AtomicExpansionKind::None
16565 : TargetLowering::AtomicExpansionKind::CmpXChg;
16566}
16567
16568/// Return if a flat address space atomicrmw can access private memory.
16569 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16570 const MDNode *NoaliasAddrSpaceMD =
16571 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16572 if (!NoaliasAddrSpaceMD)
16573 return true;
16574
16575 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16576 ++I) {
16577 auto *Low = mdconst::extract<ConstantInt>(
16578 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16579 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16580 auto *High = mdconst::extract<ConstantInt>(
16581 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16582 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16583 }
16584 }
16585
16586 return true;
16587}
16588
16589 TargetLowering::AtomicExpansionKind
16590 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16591 unsigned AS = RMW->getPointerAddressSpace();
16592 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16593 return AtomicExpansionKind::NotAtomic;
16594
16595 // 64-bit flat atomics that dynamically reside in private memory will silently
16596 // be dropped.
16597 //
16598 // Note that we will emit a new copy of the original atomic in the expansion,
16599 // which will be incrementally relegalized.
16600 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16601 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16602 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16603 flatInstrMayAccessPrivate(RMW))
16604 return AtomicExpansionKind::Expand;
16605
16606 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16607 OptimizationRemarkEmitter ORE(RMW->getFunction());
16608 ORE.emit([=]() {
16609 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16610 });
16611 return Kind;
16612 };
16613
16614 auto SSID = RMW->getSyncScopeID();
16615 bool HasSystemScope =
16616 SSID == SyncScope::System ||
16617 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16618
16619 auto Op = RMW->getOperation();
16620 switch (Op) {
16621 case AtomicRMWInst::Xchg: {
16622 // PCIe supports add and xchg for system atomics.
16623 return isAtomicRMWLegalXChgTy(RMW)
16624 ? AtomicExpansionKind::None
16625 : AtomicExpansionKind::CmpXChg;
16626 }
16627 case AtomicRMWInst::Add:
16628 case AtomicRMWInst::And:
16629 case AtomicRMWInst::UIncWrap:
16630 case AtomicRMWInst::UDecWrap:
16631 return atomicSupportedIfLegalIntType(RMW);
16632 case AtomicRMWInst::Sub:
16633 case AtomicRMWInst::Or:
16634 case AtomicRMWInst::Xor: {
16635 // Atomic sub/or/xor do not work over PCI express, but atomic add
16636 // does. InstCombine transforms these with 0 to or, so undo that.
16637 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16638 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16639 ConstVal && ConstVal->isNullValue())
16640 return AtomicExpansionKind::Expand;
16641 }
16642
16643 return atomicSupportedIfLegalIntType(RMW);
16644 }
16645 case AtomicRMWInst::FAdd: {
16646 Type *Ty = RMW->getType();
16647
16648 // TODO: Handle REGION_ADDRESS
16649 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16650 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16651 // is fixed to round-to-nearest-even.
16652 //
16653 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16654 // round-to-nearest-even.
16655 //
16656 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16657 // suggests it is OK if the floating-point mode may not match the calling
16658 // thread.
16659 if (Ty->isFloatTy()) {
16660 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16661 : AtomicExpansionKind::CmpXChg;
16662 }
16663
16664 if (Ty->isDoubleTy()) {
16665 // Ignores denormal mode, but we don't consider flushing mandatory.
16666 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16667 : AtomicExpansionKind::CmpXChg;
16668 }
16669
16670 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16671 return AtomicExpansionKind::None;
16672
16673 return AtomicExpansionKind::CmpXChg;
16674 }
16675
16676 // LDS atomics respect the denormal mode from the mode register.
16677 //
16678 // Traditionally f32 global/buffer memory atomics would unconditionally
16679 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16680 // flush.
16681 //
16682 // On targets with flat atomic fadd, denormals would flush depending on
16683 // whether the target address resides in LDS or global memory. We consider
16684 // this flat-maybe-flush as will-flush.
16685 if (Ty->isFloatTy() &&
16686 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16687 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16688 return AtomicExpansionKind::CmpXChg;
16689
16690 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16691 // safe. The message phrasing also should be better.
16692 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16693 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16694 // gfx940, gfx12
16695 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16696 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16697 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16698 // gfx90a, gfx940, gfx12
16699 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16700 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16701
16702 // gfx940, gfx12
16703 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16704 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16705 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16706 // gfx90a, gfx940, gfx12
16707 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16708 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16709
16710 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16711 // buffer. gfx12 does have the buffer version.
16712 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16713 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16714 }
16715
16716 // global and flat atomic fadd f64: gfx90a, gfx940.
16717 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16718 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16719
16720 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16721 if (Ty->isFloatTy()) {
16722 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16723 // gfx11+.
16724 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16725 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16726 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16727 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16728 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16729 } else {
16730 // gfx908
16731 if (RMW->use_empty() &&
16732 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16733 isV2F16(Ty))
16734 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16735 }
16736 }
16737
16738 // flat atomic fadd f32: gfx940, gfx11+.
16739 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16740 if (Subtarget->hasFlatAtomicFaddF32Inst())
16741 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16742
16743 // If the atomic is in the flat address space and the type is float, we
16744 // will try to expand it if the target supports both global and LDS atomic
16745 // fadd. The reason is that the expansion emits a check of the address
16746 // space: if the address is in the global address space, we emit the
16747 // global atomic fadd; if it is in the shared address space, we emit the
16748 // LDS atomic fadd.
16749 if (Subtarget->hasLDSFPAtomicAddF32()) {
16750 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16751 return AtomicExpansionKind::Expand;
16752 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16753 return AtomicExpansionKind::Expand;
16754 }
16755 }
16756 }
16757
16758 return AtomicExpansionKind::CmpXChg;
16759 }
16760 case AtomicRMWInst::FMin:
16761 case AtomicRMWInst::FMax: {
16762 Type *Ty = RMW->getType();
16763
16764 // LDS float and double fmin/fmax were always supported.
16765 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16766 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16767 : AtomicExpansionKind::CmpXChg;
16768 }
16769
16770 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16771 // For flat and global cases:
16772 // float, double in gfx7. Manual claims denormal support.
16773 // Removed in gfx8.
16774 // float, double restored in gfx10.
16775 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16776 //
16777 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16778 // no f32.
16779 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16780 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16781 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16782 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16783 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16784 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16785 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16786 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16787 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16788 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16789 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16790 }
16791 }
16792
16793 return AtomicExpansionKind::CmpXChg;
16794 }
16795 case AtomicRMWInst::Min:
16796 case AtomicRMWInst::Max:
16797 case AtomicRMWInst::UMin:
16798 case AtomicRMWInst::UMax: {
16799 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16800 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16801 // Always expand system scope min/max atomics.
16802 if (HasSystemScope)
16803 return AtomicExpansionKind::CmpXChg;
16804 }
16805
16806 return atomicSupportedIfLegalIntType(RMW);
16807 }
16808 case AtomicRMWInst::Nand:
16809 case AtomicRMWInst::FSub:
16810 default:
16811 return AtomicExpansionKind::CmpXChg;
16812 }
16813
16814 llvm_unreachable("covered atomicrmw op switch");
16815}
16816
16817 TargetLowering::AtomicExpansionKind
16818 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16819 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16820 ? AtomicExpansionKind::NotAtomic
16821 : AtomicExpansionKind::None;
16822}
16823
16824 TargetLowering::AtomicExpansionKind
16825 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16826 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16827 ? AtomicExpansionKind::NotAtomic
16828 : AtomicExpansionKind::None;
16829}
16830
16831 TargetLowering::AtomicExpansionKind
16832 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16833 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16834 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16835 return AtomicExpansionKind::NotAtomic;
16836
16837 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16838 return AtomicExpansionKind::None;
16839
16840 const DataLayout &DL = CmpX->getDataLayout();
16841
16842 Type *ValTy = CmpX->getNewValOperand()->getType();
16843
16844 // If a 64-bit flat atomic may alias private, we need to avoid using the
16845 // atomic in the private case.
16846 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16847 : AtomicExpansionKind::None;
16848}
16849
16850const TargetRegisterClass *
16851SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16852 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16853 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16854 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16855 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16856 : &AMDGPU::SReg_32RegClass;
16857 if (!TRI->isSGPRClass(RC) && !isDivergent)
16858 return TRI->getEquivalentSGPRClass(RC);
16859 if (TRI->isSGPRClass(RC) && isDivergent)
16860 return TRI->getEquivalentVGPRClass(RC);
16861
16862 return RC;
16863}
16864
16865// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16866// uniform values (as produced by the mask results of control flow intrinsics)
16867// used outside of divergent blocks. The phi users need to also be treated as
16868// always uniform.
16869//
16870// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16871static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16872 unsigned WaveSize) {
16873 // FIXME: We assume we never cast the mask results of a control flow
16874 // intrinsic.
16875 // Early exit if the type won't be consistent as a compile time hack.
16876 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16877 if (!IT || IT->getBitWidth() != WaveSize)
16878 return false;
16879
16880 if (!isa<Instruction>(V))
16881 return false;
16882 if (!Visited.insert(V).second)
16883 return false;
16884 bool Result = false;
16885 for (const auto *U : V->users()) {
16886 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16887 if (V == U->getOperand(1)) {
16888 switch (Intrinsic->getIntrinsicID()) {
16889 default:
16890 Result = false;
16891 break;
16892 case Intrinsic::amdgcn_if_break:
16893 case Intrinsic::amdgcn_if:
16894 case Intrinsic::amdgcn_else:
16895 Result = true;
16896 break;
16897 }
16898 }
16899 if (V == U->getOperand(0)) {
16900 switch (Intrinsic->getIntrinsicID()) {
16901 default:
16902 Result = false;
16903 break;
16904 case Intrinsic::amdgcn_end_cf:
16905 case Intrinsic::amdgcn_loop:
16906 Result = true;
16907 break;
16908 }
16909 }
16910 } else {
16911 Result = hasCFUser(U, Visited, WaveSize);
16912 }
16913 if (Result)
16914 break;
16915 }
16916 return Result;
16917}
16918
16919 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16920 const Value *V) const {
16921 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16922 if (CI->isInlineAsm()) {
16923 // FIXME: This cannot give a correct answer. This should only trigger in
16924 // the case where inline asm returns mixed SGPR and VGPR results, used
16925 // outside the defining block. We don't have a specific result to
16926 // consider, so this assumes if any value is SGPR, the overall register
16927 // also needs to be SGPR.
16928 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16929 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16930 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16931 for (auto &TC : TargetConstraints) {
16932 if (TC.Type == InlineAsm::isOutput) {
16933 ComputeConstraintToUse(TC, SDValue());
16934 const TargetRegisterClass *RC =
16935 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16936 TC.ConstraintVT)
16937 .second;
16938 if (RC && SIRI->isSGPRClass(RC))
16939 return true;
16940 }
16941 }
16942 }
16943 }
16944 SmallPtrSet<const Value *, 16> Visited;
16945 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16946}
16947
16948 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16949 for (SDUse &Use : N->uses()) {
16950 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16951 if (getBasePtrIndex(M) == Use.getOperandNo())
16952 return true;
16953 }
16954 }
16955 return false;
16956}
16957
16958 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16959 SDValue N1) const {
16960 if (!N0.hasOneUse())
16961 return false;
16962 // Take care of the opportunity to keep N0 uniform
16963 if (N0->isDivergent() || !N1->isDivergent())
16964 return true;
16965 // Check if we have a good chance to form the memory access pattern with the
16966 // base and offset
16967 return (DAG.isBaseWithConstantOffset(N0) &&
16968 hasMemSDNodeUser(*N0->user_begin()));
16969}
16970
16971 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16972 Register N0, Register N1) const {
16973 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16974}
16975
16976 MachineMemOperand::Flags
16977 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16978 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16979 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16980 if (I.getMetadata("amdgpu.noclobber"))
16981 Flags |= MONoClobber;
16982 if (I.getMetadata("amdgpu.last.use"))
16983 Flags |= MOLastUse;
16984 return Flags;
16985}
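// Illustrative example (names invented): a load annotated by the AMDGPU IR
// passes carries the metadata that this hook translates into MMO flags:
//   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
//   !0 = !{}
// "amdgpu.noclobber" maps to MONoClobber and "amdgpu.last.use" to MOLastUse.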
16986
16987 bool SITargetLowering::checkForPhysRegDependency(
16988 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16989 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16990 if (User->getOpcode() != ISD::CopyToReg)
16991 return false;
16992 if (!Def->isMachineOpcode())
16993 return false;
16994 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16995 if (!MDef)
16996 return false;
16997
16998 unsigned ResNo = User->getOperand(Op).getResNo();
16999 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17000 return false;
17001 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17002 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17003 PhysReg = AMDGPU::SCC;
17004 const TargetRegisterClass *RC =
17005 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17006 Cost = RC->getCopyCost();
17007 return true;
17008 }
17009 return false;
17010}
17011
17012/// Check if it is profitable to hoist instruction in then/else to if.
17013 bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17014 if (!I->hasOneUse())
17015 return true;
17016
17017 Instruction *User = I->user_back();
17018 // TODO: Add more patterns that are not profitable to hoist and
17019 // handle modifiers such as fabs and fneg
17020 switch (I->getOpcode()) {
17021 case Instruction::FMul: {
17022 if (User->getOpcode() != Instruction::FSub &&
17023 User->getOpcode() != Instruction::FAdd)
17024 return true;
17025
17026 const TargetOptions &Options = getTargetMachine().Options;
17027
17028 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17029 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17030 !Options.UnsafeFPMath) ||
17031 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17032 }
17033 default:
17034 return true;
17035 }
17036 return true;
17037}
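// Illustrative example (names invented): if a then-block contains
//   %m = fmul float %a, %b
// whose single user is "%r = fadd float %m, %c", hoisting %m to the if-block
// would separate the pair that could otherwise fold into a single fma/mad, so
// the hook above returns false when contraction is allowed and FMA is faster.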
17038
17039 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17040 Instruction *AI) const {
17041 // Given: atomicrmw fadd ptr %addr, float %val ordering
17042 //
17043 // With this expansion we produce the following code:
17044 // [...]
17045 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17046 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17047 //
17048 // atomicrmw.shared:
17049 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17050 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17051 // float %val ordering
17052 // br label %atomicrmw.phi
17053 //
17054 // atomicrmw.check.private:
17055 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17056 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17057 //
17058 // atomicrmw.private:
17059 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17060 // %loaded.private = load float, ptr addrspace(5) %cast.private
17061 // %val.new = fadd float %loaded.private, %val
17062 // store float %val.new, ptr addrspace(5) %cast.private
17063 // br label %atomicrmw.phi
17064 //
17065 // atomicrmw.global:
17066 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17067 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17068 // float %val ordering
17069 // br label %atomicrmw.phi
17070 //
17071 // atomicrmw.phi:
17072 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17073 // [ %loaded.private, %atomicrmw.private ],
17074 // [ %loaded.global, %atomicrmw.global ]
17075 // br label %atomicrmw.end
17076 //
17077 // atomicrmw.end:
17078 // [...]
17079 //
17080 //
17081 // For 64-bit atomics which may reside in private memory, we perform a simpler
17082 // version that only inserts the private check, and uses the flat operation.
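// Illustrative sketch of the simpler form (value names invented); only the
// private check is emitted and the flat instruction is kept otherwise:
//   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
//   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
//   ...
//   %loaded.global = atomicrmw add ptr %addr, i64 %val ordering,
//       !noalias.addrspace !0      ; !{i32 5, i32 6}, i.e. known not private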
17083
17084 IRBuilder<> Builder(AI);
17085 LLVMContext &Ctx = Builder.getContext();
17086
17087 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17088 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17089 : AtomicCmpXchgInst::getPointerOperandIndex();
17090 Value *Addr = AI->getOperand(PtrOpIdx);
17091
17092 /// TODO: Only need to check private, then emit flat-known-not private (no
17093 /// need for shared block, or cast to global).
17094 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17095
17096 Align Alignment;
17097 if (RMW)
17098 Alignment = RMW->getAlign();
17099 else if (CX)
17100 Alignment = CX->getAlign();
17101 else
17102 llvm_unreachable("unhandled atomic operation");
17103
17104 // FullFlatEmulation is true if we need to issue the private, shared, and
17105 // global cases.
17106 //
17107 // If this is false, we are only dealing with the flat-targeting-private case,
17108 // where we only insert a check for private and still use the flat instruction
17109 // for global and shared.
17110
17111 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17112 Subtarget->hasAtomicFaddInsts() &&
17113 RMW->getType()->isFloatTy();
17114
17115 // If the return value isn't used, do not introduce a false use in the phi.
17116 bool ReturnValueIsUsed = !AI->use_empty();
17117
17118 BasicBlock *BB = Builder.GetInsertBlock();
17119 Function *F = BB->getParent();
17120 BasicBlock *ExitBB =
17121 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17122 BasicBlock *SharedBB = nullptr;
17123
17124 BasicBlock *CheckPrivateBB = BB;
17125 if (FullFlatEmulation) {
17126 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17127 CheckPrivateBB =
17128 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17129 }
17130
17131 BasicBlock *PrivateBB =
17132 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17133 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17134 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17135
17136 std::prev(BB->end())->eraseFromParent();
17137 Builder.SetInsertPoint(BB);
17138
17139 Value *LoadedShared = nullptr;
17140 if (FullFlatEmulation) {
17141 CallInst *IsShared = Builder.CreateIntrinsic(
17142 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17143 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17144 Builder.SetInsertPoint(SharedBB);
17145 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17146 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17147
17148 Instruction *Clone = AI->clone();
17149 Clone->insertInto(SharedBB, SharedBB->end());
17150 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17151 LoadedShared = Clone;
17152
17153 Builder.CreateBr(PhiBB);
17154 Builder.SetInsertPoint(CheckPrivateBB);
17155 }
17156
17157 CallInst *IsPrivate = Builder.CreateIntrinsic(
17158 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17159 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17160
17161 Builder.SetInsertPoint(PrivateBB);
17162
17163 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17164 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17165
17166 Value *LoadedPrivate;
17167 if (RMW) {
17168 LoadedPrivate = Builder.CreateAlignedLoad(
17169 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17170
17171 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17172 LoadedPrivate, RMW->getValOperand());
17173
17174 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17175 } else {
17176 auto [ResultLoad, Equal] =
17177 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17178 CX->getNewValOperand(), CX->getAlign());
17179
17180 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17181 ResultLoad, 0);
17182 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17183 }
17184
17185 Builder.CreateBr(PhiBB);
17186
17187 Builder.SetInsertPoint(GlobalBB);
17188
17189 // Continue using a flat instruction if we only emitted the check for private.
17190 Instruction *LoadedGlobal = AI;
17191 if (FullFlatEmulation) {
17192 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17193 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17194 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17195 }
17196
17197 AI->removeFromParent();
17198 AI->insertInto(GlobalBB, GlobalBB->end());
17199
17200 // The new atomicrmw may go through another round of legalization later.
17201 if (!FullFlatEmulation) {
17202 // We inserted the runtime check already, make sure we do not try to
17203 // re-expand this.
17204 // TODO: Should union with any existing metadata.
17205 MDBuilder MDB(F->getContext());
17206 MDNode *RangeNotPrivate =
17207 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17208 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17209 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17210 RangeNotPrivate);
17211 }
17212
17213 Builder.CreateBr(PhiBB);
17214
17215 Builder.SetInsertPoint(PhiBB);
17216
17217 if (ReturnValueIsUsed) {
17218 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17219 AI->replaceAllUsesWith(Loaded);
17220 if (FullFlatEmulation)
17221 Loaded->addIncoming(LoadedShared, SharedBB);
17222 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17223 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17224 Loaded->takeName(AI);
17225 }
17226
17227 Builder.CreateBr(ExitBB);
17228}
17229
17230 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17231 AtomicRMWInst::BinOp Op = AI->getOperation();
17232
17233 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17234 Op == AtomicRMWInst::Xor) {
17235 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17236 ConstVal && ConstVal->isNullValue()) {
17237 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17238 AI->setOperation(AtomicRMWInst::Add);
17239
17240 // We may still need the private-alias-flat handling below.
17241
17242 // TODO: Skip this for cases where we cannot access remote memory.
17243 }
17244 }
17245
17246 // The non-flat expansions should only perform the de-canonicalization of
17247 // identity values.
17248 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17249 return;
17250
17251 emitExpandAtomicAddrSpacePredicate(AI);
17252}
17253
17254 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17255 emitExpandAtomicAddrSpacePredicate(CI);
17256}
17257
17258LoadInst *
17259 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17260 IRBuilder<> Builder(AI);
17261 auto Order = AI->getOrdering();
17262
17263 // The optimization removes the store aspect of the atomicrmw. Therefore, the
17264 // cache must be flushed if the atomic ordering had release semantics. That
17265 // flush is not necessarily a fence, but a release fence happens to perform it.
17266 // Avoid replacing an atomicrmw that has release semantics.
17267 if (isReleaseOrStronger(Order))
17268 return nullptr;
17269
17270 LoadInst *LI = Builder.CreateAlignedLoad(
17271 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17272 LI->setAtomic(Order, AI->getSyncScopeID());
17273 LI->copyMetadata(*AI);
17274 LI->takeName(AI);
17275 AI->replaceAllUsesWith(LI);
17276 AI->eraseFromParent();
17277 return LI;
17278}
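// Illustrative example (names invented): an idempotent update such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 acquire
// is turned by this hook into an ordering-preserving atomic load:
//   %old = load atomic i32, ptr addrspace(1) %p acquire, align 4
// Orderings with release semantics are rejected above and left unchanged.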
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1122
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
bool isNegative() const
Definition: APFloat.h:1445
APInt bitcastToAPInt() const
Definition: APFloat.h:1351
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
bool isInfinity() const
Definition: APFloat.h:1442
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1451
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2561
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1163
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1157
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2156
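A minimal sketch of how these IRBuilder helpers compose (assumptions: BB is a BasicBlock*, Ptr is a pointer Value* in a non-flat address space, and llvm/IR/IRBuilder.h plus llvm/IR/IntrinsicsAMDGPU.h are available; the intrinsic, alignment and address space are illustrative, not taken from this file):
IRBuilder<> B(BB); // append new instructions at the end of BB
Value *Ld = B.CreateAlignedLoad(B.getInt32Ty(), Ptr, Align(4), "val");
Value *Flat = B.CreateAddrSpaceCast(
    Ptr, PointerType::get(B.getInt32Ty(), AMDGPUAS::FLAT_ADDRESS), "flat");
B.CreateAlignedStore(Ld, Flat, Align(4));
B.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); // non-overloaded intrinsic, no arguments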
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
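As a small, hedged illustration of the MachineFunction hooks above (assumptions: MF is a MachineFunction&, BB is the originating IR BasicBlock*, and ExitMBB is an existing MachineBasicBlock*; this mirrors the common custom-inserter pattern rather than any code in this file):
MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
MF.push_back(NewMBB);          // append the block to the function's block list
NewMBB->addSuccessor(ExitMBB); // record the CFG edge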
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
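A hedged sketch of operand chaining on a MachineInstrBuilder via BuildMI (assumptions: MBB is a MachineBasicBlock&, I an insertion iterator, DL a DebugLoc, TII the instruction info, DstReg/SrcReg registers and TargetMBB a MachineBasicBlock*; the AMDGPU opcodes are illustrative only):
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), DstReg)
    .addReg(SrcReg)   // register source operand
    .addImm(16);      // immediate source operand
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH))
    .addMBB(TargetMBB); // basic-block operand of the branch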
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:982
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:712
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:873
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
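For example, a lowering hook might combine a few of these SelectionDAG helpers (assumptions: DAG is the SelectionDAG, DL an SDLoc, and Val an i32 SDValue; the combine itself is made up for illustration):
SDValue Sixteen = DAG.getConstant(16, DL, MVT::i32);
SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, Val, Sixteen);
KnownBits Known = DAG.computeKnownBits(Sum);
if (Known.Zero[0]) // bit 0 known zero, so a right shift by one loses nothing
  Sum = DAG.getNode(ISD::SRL, DL, MVT::i32, Sum,
                    DAG.getShiftAmountConstant(1, MVT::i32, DL));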
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
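StringSwitch is typically used as below (the enum and the constraint strings are made up for illustration; Constraint is assumed to be a StringRef):
enum class RegKind { SGPR, VGPR, Unknown };
RegKind K = StringSwitch<RegKind>(Constraint)
                .Case("s", RegKind::SGPR)
                .Case("v", RegKind::VGPR)
                .Default(RegKind::Unknown);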
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
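The kind of configuration these setters express, shown as a hedged sketch assumed to run inside a target's TargetLowering constructor (the operations, types and actions are illustrative, not a quote of this file):
setOperationAction(ISD::FSIN, MVT::f64, Expand);   // expand instead of selecting natively
setTruncStoreAction(MVT::i64, MVT::i32, Expand);   // no native truncating store
setOperationAction(ISD::SELECT, MVT::i1, Promote); // promote i1 selects...
AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32); // ...and perform them in i32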
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point use.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
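As a worked example of that formula (a minimal sketch, not code from this file; it uses the libm ldexp the node is modeled on, not the DAG node itself):
#include <cassert>
#include <cmath>
int main() {
  // FLDEXP computes op0 * 2**op1, matching libm ldexp.
  assert(std::ldexp(1.5, 4) == 24.0); // 1.5 * 2^4 = 24.0
  return 0;
}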
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest (ties to even), 2 = Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
SMULO/UMULO - Same as the overflow-aware addition/subtraction nodes ([SU]ADDO/[SU]SUBO), but for multiplication: produces the product and a boolean overflow flag.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1639
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1586
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1609
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
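A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above; the values follow from maxIntN(N) = 2^(N-1) - 1 and, for the companion minIntN listed further below, minIntN(N) = -2^(N-1):
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::maxIntN(8) == 127);    // 2^7 - 1
  assert(llvm::maxIntN(16) == 32767); // 2^15 - 1
  assert(llvm::minIntN(8) == -128);   // -2^7
  return 0;
}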
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
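A minimal usage sketch (not code from this file), assuming the llvm/ADT/bit.h declaration referenced above:
#include <cassert>
#include "llvm/ADT/bit.h"
int main() {
  assert(llvm::popcount(0xF0u) == 4); // 0b11110000 has four set bits
  assert(llvm::popcount(0u) == 0);
  return 0;
}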
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
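A minimal usage sketch (not code from this file), assuming the llvm/ADT/bit.h declaration referenced above:
#include <cassert>
#include "llvm/ADT/bit.h"
int main() {
  assert(llvm::bit_width(5u) == 3); // 0b101 needs three bits
  assert(llvm::bit_width(1u) == 1);
  assert(llvm::bit_width(0u) == 0); // zero needs no bits
  return 0;
}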
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
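A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above:
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::alignDown(37u, 8u) == 32u);     // largest multiple of 8 that is <= 37
  assert(llvm::alignDown(37u, 8u, 3u) == 35u); // largest x <= 37 with x % 8 == 3
  return 0;
}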
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of zero bits from the least significant bit towards the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
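A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above:
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::isShiftedMask_64(0x00FF0000ull));  // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x00FF00FFull)); // two separate runs of ones
  assert(!llvm::isShiftedMask_64(0ull));          // an empty run does not count
  return 0;
}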
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
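A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above:
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::Log2_32(32u) == 5u); // exact power of two
  assert(llvm::Log2_32(33u) == 5u); // floor of log2(33)
  return 0;
}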
int countl_zero(T Val)
Count the number of zero bits from the most significant bit towards the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
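A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above:
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::isPowerOf2_32(64u));
  assert(!llvm::isPowerOf2_32(12u));
  assert(!llvm::isPowerOf2_32(0u)); // "power of two > 0", so zero is rejected
  return 0;
}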
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
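Together with Hi_32 above, this splits a 64-bit value into its two 32-bit halves; a minimal usage sketch (not code from this file), assuming llvm/Support/MathExtras.h:
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"
int main() {
  uint64_t V = 0x1122334455667788ULL;
  assert(llvm::Hi_32(V) == 0x11223344u); // upper half
  assert(llvm::Lo_32(V) == 0x55667788u); // lower half
  return 0;
}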
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
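A minimal usage sketch (not code from this file), assuming the llvm/Support/MathExtras.h declaration above:
#include <cassert>
#include "llvm/Support/MathExtras.h"
int main() {
  assert(llvm::divideCeil(10u, 4u) == 3u); // ceil(10 / 4)
  assert(llvm::divideCeil(8u, 4u) == 2u);  // exact division is unchanged
  return 0;
}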
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A needed to store Size bytes.
Definition: Alignment.h:155
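A minimal usage sketch (not code from this file), assuming llvm/Support/Alignment.h as referenced above; llvm::Align is the alignment wrapper type described further below in this index:
#include <cassert>
#include "llvm/Support/Alignment.h"
int main() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16); // round 10 up to a multiple of 8
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already a multiple of 8
  return 0;
}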
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
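A minimal usage sketch (not code from this file), assuming llvm/Support/Alignment.h as referenced above; Align::value() is listed further below in this index:
#include <cassert>
#include "llvm/Support/Alignment.h"
int main() {
  // A base alignment of 16 plus an offset of 4 only guarantees 4-byte alignment.
  assert(llvm::commonAlignment(llvm::Align(16), 4).value() == 4);
  assert(llvm::commonAlignment(llvm::Align(16), 32).value() == 16);
  return 0;
}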
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals