LLVM 20.0.0git
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
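// Helper for the calling-convention lowering below: returns the first SGPR
// that CCState has not yet allocated, so special inputs can be placed after
// the explicit arguments; hits llvm_unreachable if every SGPR is taken.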
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
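// Note: registering a type with a scalar (SReg/SGPR) class above only makes
// that type legal; it does not pin values to scalar registers. Whether an
// individual value ends up in SGPRs or VGPRs is decided later, based on
// divergence, during instruction selection and register class fixups.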
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value,
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // On SI this is s_memtime; on VI it is s_memrealtime.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 } else {
859 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
860 if (Subtarget->hasMinimum3Maximum3F32())
862
863 if (Subtarget->hasMinimum3Maximum3PKF16())
865 }
866
868 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
869 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
870 MVT::i8},
871 Custom);
872
874 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
875 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
876 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
877 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
881 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
882 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
883 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
884 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
885 Custom);
886
892
893 // TODO: Could move this to custom lowering, could benefit from combines on
894 // extract of relevant bits.
896
898
899 if (Subtarget->hasBF16ConversionInsts()) {
903 }
904
905 if (Subtarget->hasCvtPkF16F32Inst()) {
907 }
908
911 ISD::SUB,
913 ISD::MUL,
914 ISD::FADD,
915 ISD::FSUB,
916 ISD::FDIV,
917 ISD::FMUL,
924 ISD::FMA,
925 ISD::SMIN,
926 ISD::SMAX,
927 ISD::UMIN,
928 ISD::UMAX,
931 ISD::SMIN,
932 ISD::SMAX,
933 ISD::UMIN,
934 ISD::UMAX,
935 ISD::AND,
936 ISD::OR,
937 ISD::XOR,
938 ISD::SHL,
939 ISD::SRL,
940 ISD::SRA,
941 ISD::FSHR,
951
952 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
954
955 // All memory operations. Some folding on the pointer operand is done to help
956 // matching the constant offsets in the addressing modes.
981
982 // FIXME: In other contexts we pretend this is a per-function property.
984
986}
987
988const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
989
991 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
992 return RCRegs;
993}
994
995//===----------------------------------------------------------------------===//
996// TargetLowering queries
997//===----------------------------------------------------------------------===//
998
999// v_mad_mix* support a conversion from f16 to f32.
1000//
1001// There is only one special case, which we don't currently handle, where this
1002// is OK to use when denormals are enabled.
1003bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1004 EVT DestVT, EVT SrcVT) const {
1005 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1006 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1007 DestVT.getScalarType() == MVT::f32 &&
1008 SrcVT.getScalarType() == MVT::f16 &&
1009 // TODO: This probably only requires no input flushing?
1011}
1012
1014 LLT DestTy, LLT SrcTy) const {
1015 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1016 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1017 DestTy.getScalarSizeInBits() == 32 &&
1018 SrcTy.getScalarSizeInBits() == 16 &&
1019 // TODO: This probably only requires no input flushing?
1020 denormalModeIsFlushAllF32(*MI.getMF());
1021}
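// Illustrative example: when the checks above succeed, a DAG like
//   (fma (fp_extend f16:$a), (fp_extend f16:$b), f32:$c)
// can keep the f16 sources as direct operands of a mixed-precision
// v_fma_mix_f32 / v_mad_mix_f32 instead of extending them first.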
1022
1024 // SI has some legal vector types, but no legal vector operations. Say no
1025 // shuffles are legal in order to prefer scalarizing some vector operations.
1026 return false;
1027}
1028
1031 EVT VT) const {
1034
1035 if (VT.isVector()) {
1036 EVT ScalarVT = VT.getScalarType();
1037 unsigned Size = ScalarVT.getSizeInBits();
1038 if (Size == 16) {
1039 if (Subtarget->has16BitInsts()) {
1040 if (VT.isInteger())
1041 return MVT::v2i16;
1042 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1043 }
1044 return VT.isInteger() ? MVT::i32 : MVT::f32;
1045 }
1046
1047 if (Size < 16)
1048 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1049 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1050 }
1051
1052 if (VT.getSizeInBits() > 32)
1053 return MVT::i32;
1054
1056}
1057
1060 EVT VT) const {
1063
1064 if (VT.isVector()) {
1065 unsigned NumElts = VT.getVectorNumElements();
1066 EVT ScalarVT = VT.getScalarType();
1067 unsigned Size = ScalarVT.getSizeInBits();
1068
1069 // FIXME: Should probably promote 8-bit vectors to i16.
1070 if (Size == 16 && Subtarget->has16BitInsts())
1071 return (NumElts + 1) / 2;
1072
1073 if (Size <= 32)
1074 return NumElts;
1075
1076 if (Size > 32)
1077 return NumElts * ((Size + 31) / 32);
1078 } else if (VT.getSizeInBits() > 32)
1079 return (VT.getSizeInBits() + 31) / 32;
1080
1082}
1083
1085 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1086 unsigned &NumIntermediates, MVT &RegisterVT) const {
1087 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1088 unsigned NumElts = VT.getVectorNumElements();
1089 EVT ScalarVT = VT.getScalarType();
1090 unsigned Size = ScalarVT.getSizeInBits();
1091 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1092// support, but unless we can properly handle 3-vectors, it will still be
1093 // inconsistent.
1094 if (Size == 16 && Subtarget->has16BitInsts()) {
1095 if (ScalarVT == MVT::bf16) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = MVT::v2bf16;
1098 } else {
1099 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1100 IntermediateVT = RegisterVT;
1101 }
1102 NumIntermediates = (NumElts + 1) / 2;
1103 return NumIntermediates;
1104 }
1105
1106 if (Size == 32) {
1107 RegisterVT = ScalarVT.getSimpleVT();
1108 IntermediateVT = RegisterVT;
1109 NumIntermediates = NumElts;
1110 return NumIntermediates;
1111 }
1112
1113 if (Size < 16 && Subtarget->has16BitInsts()) {
1114 // FIXME: Should probably form v2i16 pieces
1115 RegisterVT = MVT::i16;
1116 IntermediateVT = ScalarVT;
1117 NumIntermediates = NumElts;
1118 return NumIntermediates;
1119 }
1120
1121 if (Size != 16 && Size <= 32) {
1122 RegisterVT = MVT::i32;
1123 IntermediateVT = ScalarVT;
1124 NumIntermediates = NumElts;
1125 return NumIntermediates;
1126 }
1127
1128 if (Size > 32) {
1129 RegisterVT = MVT::i32;
1130 IntermediateVT = RegisterVT;
1131 NumIntermediates = NumElts * ((Size + 31) / 32);
1132 return NumIntermediates;
1133 }
1134 }
1135
1137 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1138}
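// Example: with 16-bit instructions available, a <5 x half> argument in a
// non-kernel calling convention is broken into 3 v2f16 intermediates
// ((5 + 1) / 2), each carried in a v2f16 register.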
1139
1141 const DataLayout &DL, Type *Ty,
1142 unsigned MaxNumLanes) {
1143 assert(MaxNumLanes != 0);
1144
1145 LLVMContext &Ctx = Ty->getContext();
1146 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1147 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1148 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1149 NumElts);
1150 }
1151
1152 return TLI.getValueType(DL, Ty);
1153}
1154
1155// Peek through TFE struct returns to only use the data size.
1157 const DataLayout &DL, Type *Ty,
1158 unsigned MaxNumLanes) {
1159 auto *ST = dyn_cast<StructType>(Ty);
1160 if (!ST)
1161 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1162
1163 // TFE intrinsics return an aggregate type.
1164 assert(ST->getNumContainedTypes() == 2 &&
1165 ST->getContainedType(1)->isIntegerTy(32));
1166 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1167}
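// Example: a TFE image load returning {<4 x float>, i32} in the IR is treated
// as if only the <4 x float> data part were loaded; the trailing i32 status
// word does not contribute to the memory VT.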
1168
1169/// Map address space 7 to MVT::v5i32 because that's its in-memory
1170/// representation. This return value is vector-typed because there is no
1171/// MVT::i160 and it is not clear if one can be added. While this could
1172/// cause issues during codegen, these address space 7 pointers will be
1173/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1174/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1175/// modeling, to work.
1177 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1178 return MVT::v5i32;
1180 DL.getPointerSizeInBits(AS) == 192)
1181 return MVT::v6i32;
1183}
1184/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1185/// v8i32 when padding is added.
1186/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1187/// also v8i32 with padding.
1189 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1190 DL.getPointerSizeInBits(AS) == 160) ||
1192 DL.getPointerSizeInBits(AS) == 192))
1193 return MVT::v8i32;
1195}
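// Example: a 160-bit buffer fat pointer (address space 7) is given the value
// type v5i32 above, while its in-memory type is the padded v8i32 form.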
1196
1198 const CallInst &CI,
1199 MachineFunction &MF,
1200 unsigned IntrID) const {
1202 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1204
1205 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1207 AttributeList Attr =
1209 MemoryEffects ME = Attr.getMemoryEffects();
1210 if (ME.doesNotAccessMemory())
1211 return false;
1212
1213 // TODO: Should images get their own address space?
1214 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1215
1216 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1217 if (RsrcIntr->IsImage) {
1220 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1221 Info.align.reset();
1222 }
1223
1224 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1225 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1226 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1227 // We conservatively set the memory operand of a buffer intrinsic to the
1228 // base resource pointer, so that we can access alias information about
1229 // those pointers. Cases like "this points at the same value
1230 // but with a different offset" are handled in
1231 // areMemAccessesTriviallyDisjoint.
1232 Info.ptrVal = RsrcArg;
1233 }
1234
1235 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1236 if (!IsSPrefetch) {
1237 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1238 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1240 }
1241
1243 if (ME.onlyReadsMemory()) {
1244 if (RsrcIntr->IsImage) {
1245 unsigned MaxNumLanes = 4;
1246
1247 if (!BaseOpcode->Gather4) {
1248 // If this isn't a gather, we may have excess loaded elements in the
1249 // IR type. Check the dmask for the real number of elements loaded.
1250 unsigned DMask =
1251 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1252 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1253 }
1254
1255 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1256 CI.getType(), MaxNumLanes);
1257 } else {
1258 Info.memVT =
1260 std::numeric_limits<unsigned>::max());
1261 }
1262
1263 // FIXME: What does alignment mean for an image?
1266 } else if (ME.onlyWritesMemory()) {
1268
1269 Type *DataTy = CI.getArgOperand(0)->getType();
1270 if (RsrcIntr->IsImage) {
1271 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1272 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1273 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1274 DMaskLanes);
1275 } else
1276 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1277
1279 } else {
1280 // Atomic, NoReturn Sampler or prefetch
1283 Info.flags |=
1285
1286 if (!IsSPrefetch)
1288
1289 switch (IntrID) {
1290 default:
1291 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1292 // Fake memory access type for no return sampler intrinsics
1293 Info.memVT = MVT::i32;
1294 } else {
1295 // XXX - Should this be volatile without known ordering?
1297 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1298 }
1299 break;
1300 case Intrinsic::amdgcn_raw_buffer_load_lds:
1301 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1302 case Intrinsic::amdgcn_struct_buffer_load_lds:
1303 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1304 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1305 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1306 Info.ptrVal = CI.getArgOperand(1);
1307 return true;
1308 }
1309 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1310 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1311 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1312 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1313 Info.memVT =
1315 std::numeric_limits<unsigned>::max());
1316 Info.flags &= ~MachineMemOperand::MOStore;
1317 return true;
1318 }
1319 }
1320 }
1321 return true;
1322 }
1323
1324 switch (IntrID) {
1325 case Intrinsic::amdgcn_ds_ordered_add:
1326 case Intrinsic::amdgcn_ds_ordered_swap: {
1328 Info.memVT = MVT::getVT(CI.getType());
1329 Info.ptrVal = CI.getOperand(0);
1330 Info.align.reset();
1332
1333 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1334 if (!Vol->isZero())
1336
1337 return true;
1338 }
1339 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1340 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1342 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1343 Info.ptrVal = nullptr;
1344 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1346 return true;
1347 }
1348 case Intrinsic::amdgcn_ds_append:
1349 case Intrinsic::amdgcn_ds_consume: {
1351 Info.memVT = MVT::getVT(CI.getType());
1352 Info.ptrVal = CI.getOperand(0);
1353 Info.align.reset();
1355
1356 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1357 if (!Vol->isZero())
1359
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_atomic_csub: {
1364 Info.memVT = MVT::getVT(CI.getType());
1365 Info.ptrVal = CI.getOperand(0);
1366 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1373 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1374
1375 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1376 Info.align.reset();
1377 Info.flags |=
1379 return true;
1380 }
1381 case Intrinsic::amdgcn_global_atomic_fmin_num:
1382 case Intrinsic::amdgcn_global_atomic_fmax_num:
1383 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1384 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1385 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1386 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1388 Info.memVT = MVT::getVT(CI.getType());
1389 Info.ptrVal = CI.getOperand(0);
1390 Info.align.reset();
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_tr_b64:
1397 case Intrinsic::amdgcn_global_load_tr_b128:
1398 case Intrinsic::amdgcn_ds_read_tr4_b64:
1399 case Intrinsic::amdgcn_ds_read_tr6_b96:
1400 case Intrinsic::amdgcn_ds_read_tr8_b64:
1401 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1403 Info.memVT = MVT::getVT(CI.getType());
1404 Info.ptrVal = CI.getOperand(0);
1405 Info.align.reset();
1407 return true;
1408 }
1409 case Intrinsic::amdgcn_ds_gws_init:
1410 case Intrinsic::amdgcn_ds_gws_barrier:
1411 case Intrinsic::amdgcn_ds_gws_sema_v:
1412 case Intrinsic::amdgcn_ds_gws_sema_br:
1413 case Intrinsic::amdgcn_ds_gws_sema_p:
1414 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1416
1417 const GCNTargetMachine &TM =
1418 static_cast<const GCNTargetMachine &>(getTargetMachine());
1419
1421 Info.ptrVal = MFI->getGWSPSV(TM);
1422
1423 // This is an abstract access, but we need to specify a type and size.
1424 Info.memVT = MVT::i32;
1425 Info.size = 4;
1426 Info.align = Align(4);
1427
1428 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1430 else
1432 return true;
1433 }
1434 case Intrinsic::amdgcn_global_load_lds: {
1436 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1437 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1438 Info.ptrVal = CI.getArgOperand(1);
1440 return true;
1441 }
1442 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1444
1445 const GCNTargetMachine &TM =
1446 static_cast<const GCNTargetMachine &>(getTargetMachine());
1447
1449 Info.ptrVal = MFI->getGWSPSV(TM);
1450
1451 // This is an abstract access, but we need to specify a type and size.
1452 Info.memVT = MVT::i32;
1453 Info.size = 4;
1454 Info.align = Align(4);
1455
1457 return true;
1458 }
1459 case Intrinsic::amdgcn_s_prefetch_data: {
1461 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1462 Info.ptrVal = CI.getArgOperand(0);
1464 return true;
1465 }
1466 default:
1467 return false;
1468 }
1469}
1470
1472 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1473 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1474 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1475 // The DAG's ValueType loses the addrspaces.
1476 // Add them as 2 extra Constant operands "from" and "to".
1477 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1478 unsigned DstAS = I.getType()->getPointerAddressSpace();
1479 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1480 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1481 break;
1482 }
1483 default:
1484 break;
1485 }
1486}
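// Example: for llvm.amdgcn.addrspacecast.nonnull casting a local (addrspace 3)
// pointer to a flat pointer, the two target constants appended above carry the
// source and destination address spaces, which the DAG value types no longer
// encode.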
1487
1490 Type *&AccessTy) const {
1491 Value *Ptr = nullptr;
1492 switch (II->getIntrinsicID()) {
1493 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1494 case Intrinsic::amdgcn_ds_append:
1495 case Intrinsic::amdgcn_ds_consume:
1496 case Intrinsic::amdgcn_ds_read_tr4_b64:
1497 case Intrinsic::amdgcn_ds_read_tr6_b96:
1498 case Intrinsic::amdgcn_ds_read_tr8_b64:
1499 case Intrinsic::amdgcn_ds_read_tr16_b64:
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap:
1502 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1503 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1504 case Intrinsic::amdgcn_global_atomic_csub:
1505 case Intrinsic::amdgcn_global_atomic_fmax_num:
1506 case Intrinsic::amdgcn_global_atomic_fmin_num:
1507 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1508 case Intrinsic::amdgcn_global_load_tr_b64:
1509 case Intrinsic::amdgcn_global_load_tr_b128:
1510 Ptr = II->getArgOperand(0);
1511 break;
1512 case Intrinsic::amdgcn_global_load_lds:
1513 Ptr = II->getArgOperand(1);
1514 break;
1515 default:
1516 return false;
1517 }
1518 AccessTy = II->getType();
1519 Ops.push_back(Ptr);
1520 return true;
1521}
1522
1524 unsigned AddrSpace) const {
1525 if (!Subtarget->hasFlatInstOffsets()) {
1526 // Flat instructions do not have offsets, and only have the register
1527 // address.
1528 return AM.BaseOffs == 0 && AM.Scale == 0;
1529 }
1530
1531 decltype(SIInstrFlags::FLAT) FlatVariant =
1535
1536 return AM.Scale == 0 &&
1537 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1538 AM.BaseOffs, AddrSpace, FlatVariant));
1539}
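// Example: with flat instruction offsets available, (base + imm) is accepted
// here whenever imm fits the flat/global/scratch offset field for the address
// space, while any scaled-index form (base + scale * index) is rejected.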
1540
1542 if (Subtarget->hasFlatGlobalInsts())
1544
1545 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1546 // Assume that we will use FLAT for all global memory accesses
1547 // on VI.
1548 // FIXME: This assumption is currently wrong. On VI we still use
1549 // MUBUF instructions for the r + i addressing mode. As currently
1550 // implemented, the MUBUF instructions only work on buffer < 4GB.
1551 // It may be possible to support > 4GB buffers with MUBUF instructions,
1552 // by setting the stride value in the resource descriptor which would
1553 // increase the size limit to (stride * 4GB). However, this is risky,
1554 // because it has never been validated.
1556 }
1557
1558 return isLegalMUBUFAddressingMode(AM);
1559}
1560
1561bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1562 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1563 // additionally can do r + r + i with addr64. 32-bit has more addressing
1564 // mode options. Depending on the resource constant, it can also do
1565 // (i64 r0) + (i32 r1) * (i14 i).
1566 //
1567 // Private arrays end up using a scratch buffer most of the time, so also
1568 // assume those use MUBUF instructions. Scratch loads / stores are currently
1569 // implemented as mubuf instructions with offen bit set, so slightly
1570 // different than the normal addr64.
1571 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1572 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1573 return false;
1574
1575 // FIXME: Since we can split immediate into soffset and immediate offset,
1576 // would it make sense to allow any immediate?
1577
1578 switch (AM.Scale) {
1579 case 0: // r + i or just i, depending on HasBaseReg.
1580 return true;
1581 case 1:
1582 return true; // We have r + r or r + i.
1583 case 2:
1584 if (AM.HasBaseReg) {
1585 // Reject 2 * r + r.
1586 return false;
1587 }
1588
1589 // Allow 2 * r as r + r
1590 // Or 2 * r + i is allowed as r + r + i.
1591 return true;
1592 default: // Don't allow n * r
1593 return false;
1594 }
1595}
1596
1598 const AddrMode &AM, Type *Ty,
1599 unsigned AS,
1600 Instruction *I) const {
1601 // No global is ever allowed as a base.
1602 if (AM.BaseGV)
1603 return false;
1604
1605 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1606 return isLegalGlobalAddressingMode(AM);
1607
1608 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1612 // If the offset isn't a multiple of 4, it probably isn't going to be
1613 // correctly aligned.
1614 // FIXME: Can we get the real alignment here?
1615 if (AM.BaseOffs % 4 != 0)
1616 return isLegalMUBUFAddressingMode(AM);
1617
1618 if (!Subtarget->hasScalarSubwordLoads()) {
1619 // There are no SMRD extloads, so if we have to do a small type access we
1620 // will use a MUBUF load.
1621 // FIXME?: We also need to do this if unaligned, but we don't know the
1622 // alignment here.
1623 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1624 return isLegalGlobalAddressingMode(AM);
1625 }
1626
1628 // SMRD instructions have an 8-bit, dword offset on SI.
1629 if (!isUInt<8>(AM.BaseOffs / 4))
1630 return false;
1631 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1632 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1633 // in 8-bits, it can use a smaller encoding.
1634 if (!isUInt<32>(AM.BaseOffs / 4))
1635 return false;
1636 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1637 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1638 if (!isUInt<20>(AM.BaseOffs))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1641 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1642 // for S_BUFFER_* instructions).
1643 if (!isInt<21>(AM.BaseOffs))
1644 return false;
1645 } else {
1646 // On GFX12, all offsets are signed 24-bit in bytes.
1647 if (!isInt<24>(AM.BaseOffs))
1648 return false;
1649 }
1650
1651 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1653 AM.BaseOffs < 0) {
1654 // Scalar (non-buffer) loads can only use a negative offset if
1655 // soffset+offset is non-negative. Since the compiler can only prove that
1656 // in a few special cases, it is safer to claim that negative offsets are
1657 // not supported.
1658 return false;
1659 }
1660
1661 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1662 return true;
1663
1664 if (AM.Scale == 1 && AM.HasBaseReg)
1665 return true;
1666
1667 return false;
1668 }
1669
1670 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1671 return Subtarget->enableFlatScratch()
1673 : isLegalMUBUFAddressingMode(AM);
1674
1675 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1676 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1677 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1678 // field.
1679 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1680 // an 8-bit dword offset but we don't know the alignment here.
1681 if (!isUInt<16>(AM.BaseOffs))
1682 return false;
1683
1684 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1685 return true;
1686
1687 if (AM.Scale == 1 && AM.HasBaseReg)
1688 return true;
1689
1690 return false;
1691 }
1692
1694 // For an unknown address space, this usually means that this is for some
1695 // reason being used for pure arithmetic, and not based on some addressing
1696 // computation. We don't have instructions that compute pointers with any
1697 // addressing modes, so treat them as having no offset like flat
1698 // instructions.
1700 }
1701
1702 // Assume a user alias of global for unknown address spaces.
1703 return isLegalGlobalAddressingMode(AM);
1704}
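// Example: a dword-aligned constant-address load at (base + 1024) is a legal
// addressing mode on gfx9 (the offset fits the signed 21-bit SMEM field),
// whereas (base + 2) falls back to the MUBUF rules above because the offset
// is not a multiple of 4.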
1705
1707 const MachineFunction &MF) const {
1709 return (MemVT.getSizeInBits() <= 4 * 32);
1710 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1711 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1712 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1713 }
1715 return (MemVT.getSizeInBits() <= 2 * 32);
1716 return true;
1717}
1718
1720 unsigned Size, unsigned AddrSpace, Align Alignment,
1721 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1722 if (IsFast)
1723 *IsFast = 0;
1724
1725 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1726 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1727 // Check if alignment requirements for ds_read/write instructions are
1728 // disabled.
1729 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1730 return false;
1731
1732 Align RequiredAlignment(
1733 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1734 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1735 Alignment < RequiredAlignment)
1736 return false;
1737
1738 // Either the alignment requirements are "enabled", or there is an
1739 // unaligned LDS access related hardware bug even though alignment requirements
1740 // are "disabled". In either case, we need to check for proper alignment
1741 // requirements.
1742 //
1743 switch (Size) {
1744 case 64:
1745 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1746 // address is negative, then the instruction is incorrectly treated as
1747 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1748 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1749 // load later in the SILoadStoreOptimizer.
1750 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1751 return false;
1752
1753 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1754 // can do a 4 byte aligned, 8 byte access in a single operation using
1755 // ds_read2/write2_b32 with adjacent offsets.
1756 RequiredAlignment = Align(4);
1757
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1760 // ds_write2_b32 depending on the alignment. In either case with either
1761 // alignment there is no faster way of doing this.
1762
1763 // The numbers returned here and below are not additive, it is a 'speed
1764 // rank'. They are just meant to be compared to decide if a certain way
1765 // of lowering an operation is faster than another. For that purpose
1766 // a naturally aligned operation gets its bitsize to indicate that "it
1767 // operates with a speed comparable to N-bit wide load". With the full
1768 // alignment ds128 is slower than ds96 for example. If underaligned it
1769 // is comparable to a speed of a single dword access, which would then
1770 // mean 32 < 128 and it is faster to issue a wide load regardless.
1771 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1772 // wider load which will no longer be aligned, the latter is slower.
1773 if (IsFast)
1774 *IsFast = (Alignment >= RequiredAlignment) ? 64
1775 : (Alignment < Align(4)) ? 32
1776 : 1;
1777 return true;
1778 }
1779
1780 break;
1781 case 96:
1782 if (!Subtarget->hasDS96AndDS128())
1783 return false;
1784
1785 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1786 // gfx8 and older.
1787
1788 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1789 // Naturally aligned access is fastest. However, also report it is Fast
1790 // if memory is aligned less than DWORD. A narrow load or store will be
1791 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1792 // be more of them, so overall we will pay less penalty issuing a single
1793 // instruction.
1794
1795 // See comment on the values above.
1796 if (IsFast)
1797 *IsFast = (Alignment >= RequiredAlignment) ? 96
1798 : (Alignment < Align(4)) ? 32
1799 : 1;
1800 return true;
1801 }
1802
1803 break;
1804 case 128:
1805 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1806 return false;
1807
1808 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1809 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1810 // single operation using ds_read2/write2_b64.
1811 RequiredAlignment = Align(8);
1812
1813 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1814 // Naturally aligned access is fastest. However, also report it is Fast
1815 // if memory is aligned less than DWORD. A narrow load or store will be
1816 // equally slow as a single ds_read_b128/ds_write_b128, but there
1817 // will be more of them, so overall we will pay less penalty issuing a
1818 // single instruction.
1819
1820 // See comment on the values above.
1821 if (IsFast)
1822 *IsFast = (Alignment >= RequiredAlignment) ? 128
1823 : (Alignment < Align(4)) ? 32
1824 : 1;
1825 return true;
1826 }
1827
1828 break;
1829 default:
1830 if (Size > 32)
1831 return false;
1832
1833 break;
1834 }
1835
1836 // See comment on the values above.
1837 // Note that we have a single-dword or sub-dword here, so if underaligned
1838 // it is the slowest possible access, hence the returned value is 0.
1839 if (IsFast)
1840 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1841
1842 return Alignment >= RequiredAlignment ||
1843 Subtarget->hasUnalignedDSAccessEnabled();
1844 }
1845
1846 // FIXME: We have to be conservative here and assume that flat operations
1847 // will access scratch. If we had access to the IR function, then we
1848 // could determine if any private memory was used in the function.
1849 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1850 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1851 bool AlignedBy4 = Alignment >= Align(4);
1852 if (IsFast)
1853 *IsFast = AlignedBy4;
1854
1855 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1856 }
1857
1858 // So long as they are correct, wide global memory operations perform better
1859 // than multiple smaller memory ops -- even when misaligned
1860 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1861 if (IsFast)
1862 *IsFast = Size;
1863
1864 return Alignment >= Align(4) ||
1866 }
1867
1868 // Values smaller than a dword must be aligned.
1869 if (Size < 32)
1870 return false;
1871
1872 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1873 // byte-address are ignored, thus forcing Dword alignment.
1874 // This applies to private, global, and constant memory.
1875 if (IsFast)
1876 *IsFast = 1;
1877
1878 return Size >= 32 && Alignment >= Align(4);
1879}
1880
1882 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1883 unsigned *IsFast) const {
1885 Alignment, Flags, IsFast);
1886}
1887
1889 const MemOp &Op, const AttributeList &FuncAttributes) const {
1890 // FIXME: Should account for address space here.
1891
1892 // The default fallback uses the private pointer size as a guess for a type to
1893 // use. Make sure we switch these to 64-bit accesses.
1894
1895 if (Op.size() >= 16 &&
1896 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1897 return MVT::v4i32;
1898
1899 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1900 return MVT::v2i32;
1901
1902 // Use the default.
1903 return MVT::Other;
1904}
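// Example: a memcpy/memset of 16 or more bytes with a 4-byte-aligned
// destination is expanded using v4i32 (dwordx4) accesses, 8..15 bytes use
// v2i32, and anything smaller defers to the generic choice via MVT::Other.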
1905
1907 const MemSDNode *MemNode = cast<MemSDNode>(N);
1908 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1909}
1910
1912 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1914}
1915
1917 unsigned DestAS) const {
1918 // Flat -> private/local is a simple truncate.
1919 // Flat -> global is no-op
1920 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1921 return true;
1922
1923 const GCNTargetMachine &TM =
1924 static_cast<const GCNTargetMachine &>(getTargetMachine());
1925 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1926}
1927
1930 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1931 VT.getScalarType().bitsLE(MVT::i16))
1934}
1935
1937 Type *Ty) const {
1938 // FIXME: Could be smarter if called for vector constants.
1939 return true;
1940}
1941
1943 unsigned Index) const {
1945 return false;
1946
1947 // TODO: Add more cases that are cheap.
1948 return Index == 0;
1949}
1950
1952 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1953 switch (Op) {
1954 case ISD::LOAD:
1955 case ISD::STORE:
1956 return true;
1957 default:
1958 return false;
1959 }
1960 }
1961
1962 // SimplifySetCC uses this function to determine whether or not it should
1963 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1964 if (VT == MVT::i1 && Op == ISD::SETCC)
1965 return false;
1966
1968}
1969
1970SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1971 const SDLoc &SL,
1972 SDValue Chain,
1973 uint64_t Offset) const {
1974 const DataLayout &DL = DAG.getDataLayout();
1978
1979 auto [InputPtrReg, RC, ArgTy] =
1981
1982 // We may not have the kernarg segment argument if we have no kernel
1983 // arguments.
1984 if (!InputPtrReg)
1985 return DAG.getConstant(Offset, SL, PtrVT);
1986
1988 SDValue BasePtr = DAG.getCopyFromReg(
1989 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1990
1991 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1992}
1993
1994SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1995 const SDLoc &SL) const {
1998 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1999}
2000
2001SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2002 const SDLoc &SL) const {
2003
2005 std::optional<uint32_t> KnownSize =
2007 if (KnownSize.has_value())
2008 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2009 return SDValue();
2010}
2011
2012SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2013 const SDLoc &SL, SDValue Val,
2014 bool Signed,
2015 const ISD::InputArg *Arg) const {
2016 // First, if it is a widened vector, narrow it.
2017 if (VT.isVector() &&
2019 EVT NarrowedVT =
2022 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2023 DAG.getConstant(0, SL, MVT::i32));
2024 }
2025
2026 // Then convert the vector elements or scalar value.
2027 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2028 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2029 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2030 }
2031
2032 if (MemVT.isFloatingPoint())
2033 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2034 else if (Signed)
2035 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2036 else
2037 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2038
2039 return Val;
2040}
2041
2042SDValue SITargetLowering::lowerKernargMemParameter(
2043 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2044 uint64_t Offset, Align Alignment, bool Signed,
2045 const ISD::InputArg *Arg) const {
2047
2048 // Try to avoid using an extload by loading earlier than the argument address,
2049 // and extracting the relevant bits. The load should hopefully be merged with
2050 // the previous argument.
2051 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2052 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2053 int64_t AlignDownOffset = alignDown(Offset, 4);
2054 int64_t OffsetDiff = Offset - AlignDownOffset;
2055
2056 EVT IntVT = MemVT.changeTypeToInteger();
2057
2058 // TODO: If we passed in the base kernel offset we could have a better
2059 // alignment than 4, but we don't really need it.
2060 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2061 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2064
2065 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2066 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2067
2068 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2069 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2070 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2071
2072 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2073 }
2074
2075 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2076 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2079
2080 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2081 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2082}
2083
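// --- Illustrative sketch (editor-added, not part of the original source). ---
// The dword trick lowerKernargMemParameter uses for a sub-dword, under-aligned
// argument: load the containing naturally aligned dword, shift the wanted
// bytes down by (Offset - AlignDownOffset) * 8, then truncate. Shown on a
// plain little-endian byte buffer; the buffer contents are made-up values.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t loadI16Kernarg(const uint8_t *KernargBase, uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // byte position in dword

  uint32_t Dword;                                   // the aligned i32 load
  std::memcpy(&Dword, KernargBase + AlignDownOffset, sizeof(Dword));

  return static_cast<uint16_t>(Dword >> (OffsetDiff * 8)); // SRL + TRUNCATE
}

int main() {
  const uint8_t Segment[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  assert(loadI16Kernarg(Segment, 2) == 0x4433); // i16 at offset 2
  assert(loadI16Kernarg(Segment, 6) == 0x8877); // i16 at offset 6
}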
2084SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2085 CCValAssign &VA, const SDLoc &SL,
2086 SDValue Chain,
2087 const ISD::InputArg &Arg) const {
2089 MachineFrameInfo &MFI = MF.getFrameInfo();
2090
2091 if (Arg.Flags.isByVal()) {
2092 unsigned Size = Arg.Flags.getByValSize();
2093 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2094 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2095 }
2096
2097 unsigned ArgOffset = VA.getLocMemOffset();
2098 unsigned ArgSize = VA.getValVT().getStoreSize();
2099
2100 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2101
2102 // Create load nodes to retrieve arguments from the stack.
2103 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2104 SDValue ArgValue;
2105
2106 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2108 MVT MemVT = VA.getValVT();
2109
2110 switch (VA.getLocInfo()) {
2111 default:
2112 break;
2113 case CCValAssign::BCvt:
2114 MemVT = VA.getLocVT();
2115 break;
2116 case CCValAssign::SExt:
2117 ExtType = ISD::SEXTLOAD;
2118 break;
2119 case CCValAssign::ZExt:
2120 ExtType = ISD::ZEXTLOAD;
2121 break;
2122 case CCValAssign::AExt:
2123 ExtType = ISD::EXTLOAD;
2124 break;
2125 }
2126
2127 ArgValue = DAG.getExtLoad(
2128 ExtType, SL, VA.getLocVT(), Chain, FIN,
2130 return ArgValue;
2131}
2132
2133SDValue SITargetLowering::getPreloadedValue(
2134 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2136 const ArgDescriptor *Reg = nullptr;
2137 const TargetRegisterClass *RC;
2138 LLT Ty;
2139
2141 const ArgDescriptor WorkGroupIDX =
2142 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2143 // If GridZ is not programmed in an entry function then the hardware will set
2144 // it to all zeros, so there is no need to mask the GridY value in the low
2145 // order bits.
2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147 AMDGPU::TTMP7,
2148 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149 const ArgDescriptor WorkGroupIDZ =
2150 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2151 if (Subtarget->hasArchitectedSGPRs() &&
2153 switch (PVID) {
2155 Reg = &WorkGroupIDX;
2156 RC = &AMDGPU::SReg_32RegClass;
2157 Ty = LLT::scalar(32);
2158 break;
2160 Reg = &WorkGroupIDY;
2161 RC = &AMDGPU::SReg_32RegClass;
2162 Ty = LLT::scalar(32);
2163 break;
2165 Reg = &WorkGroupIDZ;
2166 RC = &AMDGPU::SReg_32RegClass;
2167 Ty = LLT::scalar(32);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173
2174 if (!Reg)
2175 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2176 if (!Reg) {
2178 // It's possible for a kernarg intrinsic call to appear in a kernel with
2179 // no allocated segment, in which case we do not add the user sgpr
2180 // argument, so just return null.
2181 return DAG.getConstant(0, SDLoc(), VT);
2182 }
2183
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2185 // attributes uses the corresponding intrinsic.
2186 return DAG.getUNDEF(VT);
2187 }
2188
2189 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2190}
2191
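// --- Illustrative sketch (editor-added, not part of the original source). ---
// With architected SGPRs, getPreloadedValue above reads workgroup ID X from
// TTMP9 and the Y/Z IDs from the two halves of TTMP7 (Y in bits [15:0], Z in
// bits [31:16]); that is what the 0xFFFFu and 0xFFFF0000u masks encode. When
// GridZ is not programmed the hardware zeroes the high half, so Y can use ~0u.
#include <cassert>
#include <cstdint>

struct WorkGroupIDsYZ { uint32_t Y, Z; };

static WorkGroupIDsYZ unpackTTMP7(uint32_t TTMP7) {
  return {TTMP7 & 0xFFFFu,              // WorkGroupIDY mask
          (TTMP7 & 0xFFFF0000u) >> 16}; // WorkGroupIDZ mask, shifted down
}

int main() {
  WorkGroupIDsYZ IDs = unpackTTMP7((7u << 16) | 42u); // Z = 7, Y = 42
  assert(IDs.Y == 42 && IDs.Z == 7);
}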
2193 CallingConv::ID CallConv,
2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195 FunctionType *FType,
2196 SIMachineFunctionInfo *Info) {
2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198 const ISD::InputArg *Arg = &Ins[I];
2199
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201 "vector type argument should have been split");
2202
2203 // First check if it's a PS input addr.
2204 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2205 PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207
2208 // Inconveniently only the first part of the split is marked as isSplit,
2209 // so skip to the end. We only want to increment PSInputNum once for the
2210 // entire split argument.
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2214 "unexpected vector split in ps argument type");
2215 if (!SkipArg)
2216 Splits.push_back(*Arg);
2217 Arg = &Ins[++I];
2218 }
2219 }
2220
2221 if (SkipArg) {
2222 // We can safely skip PS inputs.
2223 Skipped.set(Arg->getOrigArgIndex());
2224 ++PSInputNum;
2225 continue;
2226 }
2227
2228 Info->markPSInputAllocated(PSInputNum);
2229 if (Arg->Used)
2230 Info->markPSInputEnabled(PSInputNum);
2231
2232 ++PSInputNum;
2233 }
2234
2235 Splits.push_back(*Arg);
2236 }
2237}
2238
2239// Allocate special inputs passed in VGPRs.
2241 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2242 SIMachineFunctionInfo &Info) const {
2243 const LLT S32 = LLT::scalar(32);
2245
2246 if (Info.hasWorkItemIDX()) {
2247 Register Reg = AMDGPU::VGPR0;
2248 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2249
2250 CCInfo.AllocateReg(Reg);
2251 unsigned Mask =
2252 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2253 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2254 }
2255
2256 if (Info.hasWorkItemIDY()) {
2257 assert(Info.hasWorkItemIDX());
2258 if (Subtarget->hasPackedTID()) {
2259 Info.setWorkItemIDY(
2260 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2261 } else {
2262 unsigned Reg = AMDGPU::VGPR1;
2263 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2264
2265 CCInfo.AllocateReg(Reg);
2266 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2267 }
2268 }
2269
2270 if (Info.hasWorkItemIDZ()) {
2271 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2272 if (Subtarget->hasPackedTID()) {
2273 Info.setWorkItemIDZ(
2274 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2275 } else {
2276 unsigned Reg = AMDGPU::VGPR2;
2277 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2278
2279 CCInfo.AllocateReg(Reg);
2280 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2281 }
2282 }
2283}
2284
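// --- Illustrative sketch (editor-added, not part of the original source). ---
// With packed TIDs (hasPackedTID), allocateSpecialEntryInputVGPRs above hands
// out a single VGPR holding all three workitem IDs: X in bits [9:0], Y in bits
// [19:10], Z in bits [29:20] -- exactly what the 0x3ff, 0x3ff << 10 and
// 0x3ff << 20 ArgDescriptor masks describe.
#include <cassert>
#include <cstdint>

static uint32_t workitemIDX(uint32_t PackedTID) { return PackedTID & 0x3ffu; }
static uint32_t workitemIDY(uint32_t PackedTID) { return (PackedTID >> 10) & 0x3ffu; }
static uint32_t workitemIDZ(uint32_t PackedTID) { return (PackedTID >> 20) & 0x3ffu; }

int main() {
  uint32_t PackedTID = (3u << 20) | (5u << 10) | 9u; // Z = 3, Y = 5, X = 9
  assert(workitemIDX(PackedTID) == 9);
  assert(workitemIDY(PackedTID) == 5);
  assert(workitemIDZ(PackedTID) == 3);
}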
2285// Try to allocate a VGPR at the end of the argument list, or if no argument
2286// VGPRs are left, allocate a stack slot instead.
2287// If \p Mask is given, it indicates the bitfield position in the register.
2288// If \p Arg is given, reuse it with the new \p Mask instead of allocating new.
2289static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2290 ArgDescriptor Arg = ArgDescriptor()) {
2291 if (Arg.isSet())
2292 return ArgDescriptor::createArg(Arg, Mask);
2293
2294 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2296 if (RegIdx == ArgVGPRs.size()) {
2297 // Spill to stack required.
2298 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2299
2300 return ArgDescriptor::createStack(Offset, Mask);
2301 }
2302
2303 unsigned Reg = ArgVGPRs[RegIdx];
2304 Reg = CCInfo.AllocateReg(Reg);
2305 assert(Reg != AMDGPU::NoRegister);
2306
2307 MachineFunction &MF = CCInfo.getMachineFunction();
2308 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2309 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2310 return ArgDescriptor::createRegister(Reg, Mask);
2311}
2312
2314 const TargetRegisterClass *RC,
2315 unsigned NumArgRegs) {
2316 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2317 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2318 if (RegIdx == ArgSGPRs.size())
2319 report_fatal_error("ran out of SGPRs for arguments");
2320
2321 unsigned Reg = ArgSGPRs[RegIdx];
2322 Reg = CCInfo.AllocateReg(Reg);
2323 assert(Reg != AMDGPU::NoRegister);
2324
2325 MachineFunction &MF = CCInfo.getMachineFunction();
2326 MF.addLiveIn(Reg, RC);
2328}
2329
2330// If this has a fixed position, we still should allocate the register in the
2331// CCInfo state. Technically we could get away with this for values passed
2332// outside of the normal argument range.
2334 const TargetRegisterClass *RC,
2335 MCRegister Reg) {
2336 Reg = CCInfo.AllocateReg(Reg);
2337 assert(Reg != AMDGPU::NoRegister);
2338 MachineFunction &MF = CCInfo.getMachineFunction();
2339 MF.addLiveIn(Reg, RC);
2340}
2341
2342static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2343 if (Arg) {
2344 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2345 Arg.getRegister());
2346 } else
2347 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2348}
2349
2350static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2351 if (Arg) {
2352 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2353 Arg.getRegister());
2354 } else
2355 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2356}
2357
2358/// Allocate implicit function VGPR arguments at the end of allocated user
2359/// arguments.
2361 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2362 SIMachineFunctionInfo &Info) const {
2363 const unsigned Mask = 0x3ff;
2364 ArgDescriptor Arg;
2365
2366 if (Info.hasWorkItemIDX()) {
2367 Arg = allocateVGPR32Input(CCInfo, Mask);
2368 Info.setWorkItemIDX(Arg);
2369 }
2370
2371 if (Info.hasWorkItemIDY()) {
2372 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2373 Info.setWorkItemIDY(Arg);
2374 }
2375
2376 if (Info.hasWorkItemIDZ())
2377 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2378}
2379
2380/// Allocate implicit function VGPR arguments in fixed registers.
2382 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2383 SIMachineFunctionInfo &Info) const {
2384 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2385 if (!Reg)
2386 report_fatal_error("failed to allocated VGPR for implicit arguments");
2387
2388 const unsigned Mask = 0x3ff;
2389 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2390 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2391 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2392}
2393
2395 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2396 SIMachineFunctionInfo &Info) const {
2397 auto &ArgInfo = Info.getArgInfo();
2398 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2399
2400 // TODO: Unify handling with private memory pointers.
2401 if (UserSGPRInfo.hasDispatchPtr())
2402 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2403
2404 if (UserSGPRInfo.hasQueuePtr())
2405 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2406
2407 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2408 // constant offset from the kernarg segment.
2409 if (Info.hasImplicitArgPtr())
2410 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2411
2412 if (UserSGPRInfo.hasDispatchID())
2413 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2414
2415 // flat_scratch_init is not applicable for non-kernel functions.
2416
2417 if (Info.hasWorkGroupIDX())
2418 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2419
2420 if (Info.hasWorkGroupIDY())
2421 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2422
2423 if (Info.hasWorkGroupIDZ())
2424 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2425
2426 if (Info.hasLDSKernelId())
2427 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2428}
2429
2430// Allocate special inputs passed in user SGPRs.
2432 MachineFunction &MF,
2433 const SIRegisterInfo &TRI,
2434 SIMachineFunctionInfo &Info) const {
2435 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2436 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2437 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2438 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2439 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2440 }
2441
2442 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2443 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2444 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2445 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2446 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2447 }
2448
2449 if (UserSGPRInfo.hasDispatchPtr()) {
2450 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2451 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2452 CCInfo.AllocateReg(DispatchPtrReg);
2453 }
2454
2455 if (UserSGPRInfo.hasQueuePtr()) {
2456 Register QueuePtrReg = Info.addQueuePtr(TRI);
2457 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(QueuePtrReg);
2459 }
2460
2461 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2463 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2464 CCInfo.AllocateReg(InputPtrReg);
2465
2466 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2467 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2468 }
2469
2470 if (UserSGPRInfo.hasDispatchID()) {
2471 Register DispatchIDReg = Info.addDispatchID(TRI);
2472 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2473 CCInfo.AllocateReg(DispatchIDReg);
2474 }
2475
2476 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2477 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2478 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2479 CCInfo.AllocateReg(FlatScratchInitReg);
2480 }
2481
2482 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2483 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2484 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2485 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2486 }
2487
2488 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2489 // these from the dispatch pointer.
2490}
2491
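// --- Illustrative sketch (editor-added, not part of the original source). ---
// allocateHSAUserSGPRs above hands out user SGPRs in a fixed order, and the
// register classes it uses (SGPR_128 / SGPR_64 / SGPR_32) imply how many SGPRs
// each enabled feature costs. The tally below only illustrates that
// accounting; it is not a model of GCNUserSGPRUsageInfo.
#include <cstdio>

struct UserSGPRFeatures {
  bool ImplicitBufferPtr, PrivateSegmentBuffer, DispatchPtr, QueuePtr,
       KernargSegmentPtr, DispatchID, FlatScratchInit, PrivateSegmentSize;
};

static unsigned countUserSGPRs(const UserSGPRFeatures &F) {
  unsigned N = 0;
  N += F.ImplicitBufferPtr ? 2 : 0;    // SGPR_64
  N += F.PrivateSegmentBuffer ? 4 : 0; // SGPR_128
  N += F.DispatchPtr ? 2 : 0;          // SGPR_64
  N += F.QueuePtr ? 2 : 0;             // SGPR_64
  N += F.KernargSegmentPtr ? 2 : 0;    // SGPR_64
  N += F.DispatchID ? 2 : 0;           // SGPR_64
  N += F.FlatScratchInit ? 2 : 0;      // SGPR_64
  N += F.PrivateSegmentSize ? 1 : 0;   // SGPR_32
  return N;
}

int main() {
  // e.g. private segment buffer + kernarg segment ptr + flat scratch init.
  UserSGPRFeatures F{false, true, false, false, true, false, true, false};
  std::printf("user SGPRs used: %u\n", countUserSGPRs(F)); // prints 8
}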
2492// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2493// sequential starting from the first argument.
2495 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2497 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2498 Function &F = MF.getFunction();
2499 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2500 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2501 bool InPreloadSequence = true;
2502 unsigned InIdx = 0;
2503 bool AlignedForImplicitArgs = false;
2504 unsigned ImplicitArgOffset = 0;
2505 for (auto &Arg : F.args()) {
2506 if (!InPreloadSequence || !Arg.hasInRegAttr())
2507 break;
2508
2509 unsigned ArgIdx = Arg.getArgNo();
2510 // Don't preload non-original args or parts not in the current preload
2511 // sequence.
2512 if (InIdx < Ins.size() &&
2513 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2514 break;
2515
2516 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2517 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2518 InIdx++) {
2519 assert(ArgLocs[ArgIdx].isMemLoc());
2520 auto &ArgLoc = ArgLocs[InIdx];
2521 const Align KernelArgBaseAlign = Align(16);
2522 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2523 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2524 unsigned NumAllocSGPRs =
2525 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2526
2527 // Fix alignment for hidden arguments.
2528 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2529 if (!AlignedForImplicitArgs) {
2530 ImplicitArgOffset =
2531 alignTo(LastExplicitArgOffset,
2532 Subtarget->getAlignmentForImplicitArgPtr()) -
2533 LastExplicitArgOffset;
2534 AlignedForImplicitArgs = true;
2535 }
2536 ArgOffset += ImplicitArgOffset;
2537 }
2538
2539 // Arg is preloaded into the previous SGPR.
2540 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2541 assert(InIdx >= 1 && "No previous SGPR");
2542 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2543 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2544 continue;
2545 }
2546
2547 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2548 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2549 // Check for free user SGPRs for preloading.
2550 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2551 InPreloadSequence = false;
2552 break;
2553 }
2554
2555 // Preload this argument.
2556 const TargetRegisterClass *RC =
2557 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2558 SmallVectorImpl<MCRegister> *PreloadRegs =
2559 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2560
2561 if (PreloadRegs->size() > 1)
2562 RC = &AMDGPU::SGPR_32RegClass;
2563 for (auto &Reg : *PreloadRegs) {
2564 assert(Reg);
2565 MF.addLiveIn(Reg, RC);
2566 CCInfo.AllocateReg(Reg);
2567 }
2568
2569 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2570 }
2571 }
2572}
2573
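// --- Illustrative sketch (editor-added, not part of the original source). ---
// The SGPR budgeting in allocatePreloadKernArgSGPRs above, reduced to plain
// arithmetic: an argument needs alignTo(size-in-bits, 32) / 32 SGPRs, plus one
// padding SGPR per 4 bytes of gap since the previous explicit argument ended.
// The free-SGPR count and offsets below are made-up example values.
#include <cstdio>

static unsigned alignToU(unsigned X, unsigned A) { return (X + A - 1) / A * A; }

// Returns true if the argument still fits in the free user SGPRs and, if so,
// consumes them and advances the running offset (LastExplicitArgOffset).
static bool tryPreload(unsigned ArgOffset, unsigned ArgSizeInBits,
                       unsigned &LastExplicitArgOffset,
                       unsigned &FreeUserSGPRs) {
  unsigned NumAllocSGPRs = alignToU(ArgSizeInBits, 32) / 32;
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = alignToU(Padding, 4) / 4;
  if (PaddingSGPRs + NumAllocSGPRs > FreeUserSGPRs)
    return false; // the preload sequence ends here
  FreeUserSGPRs -= PaddingSGPRs + NumAllocSGPRs;
  LastExplicitArgOffset = ArgOffset + NumAllocSGPRs * 4;
  return true;
}

int main() {
  unsigned LastOffset = 0, Free = 6;
  std::printf("%d\n", tryPreload(0, 64, LastOffset, Free));   // i64 at 0: 1
  std::printf("%d\n", tryPreload(16, 32, LastOffset, Free));  // i32 at 16: 1
  std::printf("%d\n", tryPreload(24, 128, LastOffset, Free)); // v4i32 at 24: 0
}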
2575 const SIRegisterInfo &TRI,
2576 SIMachineFunctionInfo &Info) const {
2577 // Always allocate this last since it is a synthetic preload.
2578 if (Info.hasLDSKernelId()) {
2579 Register Reg = Info.addLDSKernelId();
2580 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2581 CCInfo.AllocateReg(Reg);
2582 }
2583}
2584
2585// Allocate special input registers that are initialized per-wave.
2588 CallingConv::ID CallConv,
2589 bool IsShader) const {
2590 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2591 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2592 // Note: user SGPRs are handled by the front-end for graphics shaders
2593 // Pad up the used user SGPRs with dead inputs.
2594
2595 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2596 // before enabling architected SGPRs for workgroup IDs.
2597 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2598
2599 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2600 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2601 // rely on it to reach 16 since if we end up having no stack usage, it will
2602 // not really be added.
2603 unsigned NumRequiredSystemSGPRs =
2604 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2605 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2606 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 Register Reg = Info.addReservedUserSGPR();
2608 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2609 CCInfo.AllocateReg(Reg);
2610 }
2611 }
2612
2613 if (!HasArchitectedSGPRs) {
2614 if (Info.hasWorkGroupIDX()) {
2615 Register Reg = Info.addWorkGroupIDX();
2616 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2617 CCInfo.AllocateReg(Reg);
2618 }
2619
2620 if (Info.hasWorkGroupIDY()) {
2621 Register Reg = Info.addWorkGroupIDY();
2622 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(Reg);
2624 }
2625
2626 if (Info.hasWorkGroupIDZ()) {
2627 Register Reg = Info.addWorkGroupIDZ();
2628 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(Reg);
2630 }
2631 }
2632
2633 if (Info.hasWorkGroupInfo()) {
2634 Register Reg = Info.addWorkGroupInfo();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasPrivateSegmentWaveByteOffset()) {
2640 // Scratch wave offset passed in system SGPR.
2641 unsigned PrivateSegmentWaveByteOffsetReg;
2642
2643 if (IsShader) {
2644 PrivateSegmentWaveByteOffsetReg =
2645 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2646
2647 // This is true if the scratch wave byte offset doesn't have a fixed
2648 // location.
2649 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2651 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2652 }
2653 } else
2654 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2655
2656 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2657 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2658 }
2659
2660 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2661 Info.getNumPreloadedSGPRs() >= 16);
2662}
2663
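// --- Illustrative sketch (editor-added, not part of the original source). ---
// On subtargets with the user-SGPR-init-16 bug, the padding loop in
// allocateSystemSGPRs above adds dead user SGPRs until user plus required
// system SGPRs reach 16; the number of pad registers is just the shortfall.
#include <cstdio>

static unsigned numPaddingSGPRs(unsigned UserSGPRs, unsigned SystemSGPRs) {
  unsigned Used = UserSGPRs + SystemSGPRs;
  return Used < 16 ? 16 - Used : 0;
}

int main() {
  // e.g. 6 user SGPRs plus 3 workgroup-ID system SGPRs -> 7 dead pad SGPRs.
  std::printf("%u\n", numPaddingSGPRs(6, 3));
}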
2665 MachineFunction &MF,
2666 const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) {
2668 // Now that we've figured out where the scratch register inputs are, see if
2669 // we should reserve the arguments and use them directly.
2670 MachineFrameInfo &MFI = MF.getFrameInfo();
2671 bool HasStackObjects = MFI.hasStackObjects();
2672 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2673
2674 // Record that we know we have non-spill stack objects so we don't need to
2675 // check all stack objects later.
2676 if (HasStackObjects)
2677 Info.setHasNonSpillStackObjects(true);
2678
2679 // Everything live out of a block is spilled with fast regalloc, so it's
2680 // almost certain that spilling will be required.
2681 if (TM.getOptLevel() == CodeGenOptLevel::None)
2682 HasStackObjects = true;
2683
2684 // For now assume stack access is needed in any callee functions, so we need
2685 // the scratch registers to pass in.
2686 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2687
2688 if (!ST.enableFlatScratch()) {
2689 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2690 // If we have stack objects, we unquestionably need the private buffer
2691 // resource. For the Code Object V2 ABI, this will be the first 4 user
2692 // SGPR inputs. We can reserve those and use them directly.
2693
2694 Register PrivateSegmentBufferReg =
2696 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 } else {
2698 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2699 // We tentatively reserve the last registers (skipping those that may
2700 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2701 // we'll replace these with the ones immediately after those which were
2702 // really allocated. In the prologue copies will be inserted from the
2703 // argument to these reserved registers.
2704
2705 // Without HSA, relocations are used for the scratch pointer and the
2706 // buffer resource setup is always inserted in the prologue. Scratch wave
2707 // offset is still in an input SGPR.
2708 Info.setScratchRSrcReg(ReservedBufferReg);
2709 }
2710 }
2711
2713
2714 // For entry functions we have to set up the stack pointer if we use it,
2715 // whereas non-entry functions get this "for free". This means there is no
2716 // intrinsic advantage to using S32 over S34 in cases where we do not have
2717 // calls but do need a frame pointer (i.e. if we are requested to have one
2718 // because frame pointer elimination is disabled). To keep things simple we
2719 // only ever use S32 as the call ABI stack pointer, and so using it does not
2720 // imply we need a separate frame pointer.
2721 //
2722 // Try to use s32 as the SP, but move it if it would interfere with input
2723 // arguments. This won't work with calls though.
2724 //
2725 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2726 // registers.
2727 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2728 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2729 } else {
2731
2732 if (MFI.hasCalls())
2733 report_fatal_error("call in graphics shader with too many input SGPRs");
2734
2735 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2736 if (!MRI.isLiveIn(Reg)) {
2737 Info.setStackPtrOffsetReg(Reg);
2738 break;
2739 }
2740 }
2741
2742 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2743 report_fatal_error("failed to find register for SP");
2744 }
2745
2746 // hasFP should be accurate for entry functions even before the frame is
2747 // finalized, because it does not rely on the known stack size, only
2748 // properties like whether variable sized objects are present.
2749 if (ST.getFrameLowering()->hasFP(MF)) {
2750 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2751 }
2752}
2753
2756 return !Info->isEntryFunction();
2757}
2758
2760
2762 MachineBasicBlock *Entry,
2763 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2765
2766 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2767 if (!IStart)
2768 return;
2769
2770 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2771 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2772 MachineBasicBlock::iterator MBBI = Entry->begin();
2773 for (const MCPhysReg *I = IStart; *I; ++I) {
2774 const TargetRegisterClass *RC = nullptr;
2775 if (AMDGPU::SReg_64RegClass.contains(*I))
2776 RC = &AMDGPU::SGPR_64RegClass;
2777 else if (AMDGPU::SReg_32RegClass.contains(*I))
2778 RC = &AMDGPU::SGPR_32RegClass;
2779 else
2780 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2781
2782 Register NewVR = MRI->createVirtualRegister(RC);
2783 // Create copy from CSR to a virtual register.
2784 Entry->addLiveIn(*I);
2785 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2786 .addReg(*I);
2787
2788 // Insert the copy-back instructions right before the terminator.
2789 for (auto *Exit : Exits)
2790 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2791 TII->get(TargetOpcode::COPY), *I)
2792 .addReg(NewVR);
2793 }
2794}
2795
2797 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2798 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2799 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2801
2803 const Function &Fn = MF.getFunction();
2806
2807 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2808 DiagnosticInfoUnsupported NoGraphicsHSA(
2809 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2810 DAG.getContext()->diagnose(NoGraphicsHSA);
2811 return DAG.getEntryNode();
2812 }
2813
2816 BitVector Skipped(Ins.size());
2817 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2818 *DAG.getContext());
2819
2820 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2821 bool IsKernel = AMDGPU::isKernel(CallConv);
2822 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2823
2824 if (IsGraphics) {
2825 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2826 assert(!UserSGPRInfo.hasDispatchPtr() &&
2827 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2828 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2829 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2830 (void)UserSGPRInfo;
2831 if (!Subtarget->enableFlatScratch())
2832 assert(!UserSGPRInfo.hasFlatScratchInit());
2833 if ((CallConv != CallingConv::AMDGPU_CS &&
2834 CallConv != CallingConv::AMDGPU_Gfx) ||
2835 !Subtarget->hasArchitectedSGPRs())
2836 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2837 !Info->hasWorkGroupIDZ());
2838 }
2839
2840 if (CallConv == CallingConv::AMDGPU_PS) {
2841 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2842
2843 // At least one interpolation mode must be enabled or else the GPU will
2844 // hang.
2845 //
2846 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2847 // set PSInputAddr, the user wants to enable some bits after the compilation
2848 // based on run-time states. Since we can't know what the final PSInputEna
2849 // will look like, we shouldn't do anything here and the user should take
2850 // responsibility for the correct programming.
2851 //
2852 // Otherwise, the following restrictions apply:
2853 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2854 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2855 // enabled too.
2856 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2857 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2858 CCInfo.AllocateReg(AMDGPU::VGPR0);
2859 CCInfo.AllocateReg(AMDGPU::VGPR1);
2860 Info->markPSInputAllocated(0);
2861 Info->markPSInputEnabled(0);
2862 }
2863 if (Subtarget->isAmdPalOS()) {
2864 // For isAmdPalOS, the user does not enable some bits after compilation
2865 // based on run-time states; the register values being generated here are
2866 // the final ones set in hardware. Therefore we need to apply the
2867 // workaround to PSInputAddr and PSInputEnable together. (The case where
2868 // a bit is set in PSInputAddr but not PSInputEnable is where the
2869 // frontend set up an input arg for a particular interpolation mode, but
2870 // nothing uses that input arg. Really we should have an earlier pass
2871 // that removes such an arg.)
2872 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2873 if ((PsInputBits & 0x7F) == 0 ||
2874 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2875 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2876 }
2877 } else if (IsKernel) {
2878 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2879 } else {
2880 Splits.append(Ins.begin(), Ins.end());
2881 }
2882
2883 if (IsKernel)
2884 analyzeFormalArgumentsCompute(CCInfo, Ins);
2885
2886 if (IsEntryFunc) {
2887 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2888 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2889 if (IsKernel && Subtarget->hasKernargPreload())
2890 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2891
2892 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2893 } else if (!IsGraphics) {
2894 // For the fixed ABI, pass workitem IDs in the last argument register.
2895 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2896
2897 // FIXME: Sink this into allocateSpecialInputSGPRs
2898 if (!Subtarget->enableFlatScratch())
2899 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2900
2901 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2902 }
2903
2904 if (!IsKernel) {
2905 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2906 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2907 }
2908
2910
2911 // FIXME: This is the minimum kernel argument alignment. We should improve
2912 // this to the maximum alignment of the arguments.
2913 //
2914 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2915 // kern arg offset.
2916 const Align KernelArgBaseAlign = Align(16);
2917
2918 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2919 const ISD::InputArg &Arg = Ins[i];
2920 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2921 InVals.push_back(DAG.getUNDEF(Arg.VT));
2922 continue;
2923 }
2924
2925 CCValAssign &VA = ArgLocs[ArgIdx++];
2926 MVT VT = VA.getLocVT();
2927
2928 if (IsEntryFunc && VA.isMemLoc()) {
2929 VT = Ins[i].VT;
2930 EVT MemVT = VA.getLocVT();
2931
2932 const uint64_t Offset = VA.getLocMemOffset();
2933 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2934
2935 if (Arg.Flags.isByRef()) {
2936 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2937
2938 const GCNTargetMachine &TM =
2939 static_cast<const GCNTargetMachine &>(getTargetMachine());
2940 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2941 Arg.Flags.getPointerAddrSpace())) {
2944 }
2945
2946 InVals.push_back(Ptr);
2947 continue;
2948 }
2949
2950 SDValue NewArg;
2951 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2952 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2953 // In this case the argument is packed into the previous preload SGPR.
2954 int64_t AlignDownOffset = alignDown(Offset, 4);
2955 int64_t OffsetDiff = Offset - AlignDownOffset;
2956 EVT IntVT = MemVT.changeTypeToInteger();
2957
2961 Register Reg =
2962 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2963
2964 assert(Reg);
2965 Register VReg = MRI.getLiveInVirtReg(Reg);
2966 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2967
2968 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2969 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2970
2971 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2972 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2973 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2974 Ins[i].Flags.isSExt(), &Ins[i]);
2975
2976 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2977 } else {
2981 const SmallVectorImpl<MCRegister> &PreloadRegs =
2982 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2983
2984 SDValue Copy;
2985 if (PreloadRegs.size() == 1) {
2986 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2987 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2988 NewArg = DAG.getCopyFromReg(
2989 Chain, DL, VReg,
2991 TRI->getRegSizeInBits(*RC)));
2992
2993 } else {
2994 // If the kernarg alignment does not match the alignment of the SGPR
2995 // tuple RC that can accommodate this argument, it will be built up
2996 // via copies from the individual SGPRs that the argument was
2997 // preloaded to.
2999 for (auto Reg : PreloadRegs) {
3000 Register VReg = MRI.getLiveInVirtReg(Reg);
3001 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3002 Elts.push_back(Copy);
3003 }
3004 NewArg =
3005 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3006 PreloadRegs.size()),
3007 DL, Elts);
3008 }
3009
3010 // If the argument was preloaded to multiple consecutive 32-bit
3011 // registers because of misalignment between addressable SGPR tuples
3012 // and the argument size, we can still assume, because of kernarg
3013 // segment alignment restrictions, that NewArg's size is the same as
3014 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3015 // truncate since we cannot preload to less than a single SGPR and the
3016 // MemVT may be smaller.
3017 EVT MemVTInt =
3019 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3020 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3021
3022 NewArg = DAG.getBitcast(MemVT, NewArg);
3023 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3024 Ins[i].Flags.isSExt(), &Ins[i]);
3025 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3026 }
3027 } else {
3028 // Hidden arguments that are in the kernel signature must be preloaded
3029 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3030 // the argument list and is not preloaded.
3031 if (Arg.isOrigArg()) {
3032 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3033 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3034 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3035 *OrigArg->getParent(),
3036 "hidden argument in kernel signature was not preloaded",
3037 DL.getDebugLoc());
3038 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3039 }
3040 }
3041
3042 NewArg =
3043 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3044 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3045 }
3046 Chains.push_back(NewArg.getValue(1));
3047
3048 auto *ParamTy =
3049 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3051 ParamTy &&
3052 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3053 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3054 // On SI local pointers are just offsets into LDS, so they are always
3055 // less than 16-bits. On CI and newer they could potentially be
3056 // real pointers, so we can't guarantee their size.
3057 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3058 DAG.getValueType(MVT::i16));
3059 }
3060
3061 InVals.push_back(NewArg);
3062 continue;
3063 }
3064 if (!IsEntryFunc && VA.isMemLoc()) {
3065 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3066 InVals.push_back(Val);
3067 if (!Arg.Flags.isByVal())
3068 Chains.push_back(Val.getValue(1));
3069 continue;
3070 }
3071
3072 assert(VA.isRegLoc() && "Parameter must be in a register!");
3073
3074 Register Reg = VA.getLocReg();
3075 const TargetRegisterClass *RC = nullptr;
3076 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3077 RC = &AMDGPU::VGPR_32RegClass;
3078 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3079 RC = &AMDGPU::SGPR_32RegClass;
3080 else
3081 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3082 EVT ValVT = VA.getValVT();
3083
3084 Reg = MF.addLiveIn(Reg, RC);
3085 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3086
3087 if (Arg.Flags.isSRet()) {
3088 // The return object should be reasonably addressable.
3089
3090 // FIXME: This helps when the return is a real sret. If it is an
3091 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3092 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3093 unsigned NumBits =
3095 Val = DAG.getNode(
3096 ISD::AssertZext, DL, VT, Val,
3097 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3098 }
3099
3100 // If this is an 8 or 16-bit value, it is really passed promoted
3101 // to 32 bits. Insert an assert[sz]ext to capture this, then
3102 // truncate to the right size.
3103 switch (VA.getLocInfo()) {
3104 case CCValAssign::Full:
3105 break;
3106 case CCValAssign::BCvt:
3107 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3108 break;
3109 case CCValAssign::SExt:
3110 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3111 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3112 break;
3113 case CCValAssign::ZExt:
3114 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3115 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3116 break;
3117 case CCValAssign::AExt:
3118 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3119 break;
3120 default:
3121 llvm_unreachable("Unknown loc info!");
3122 }
3123
3124 InVals.push_back(Val);
3125 }
3126
3127 // Start adding system SGPRs.
3128 if (IsEntryFunc)
3129 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3130
3131 // DAG.getPass() returns nullptr when using new pass manager.
3132 // TODO: Use DAG.getMFAM() to access analysis result.
3133 if (DAG.getPass()) {
3134 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3135 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3136 }
3137
3138 unsigned StackArgSize = CCInfo.getStackSize();
3139 Info->setBytesInStackArgArea(StackArgSize);
3140
3141 return Chains.empty() ? Chain
3142 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3143}
3144
3145// TODO: If return values can't fit in registers, we should return as many as
3146// possible in registers before passing on stack.
3148 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3149 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3150 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3151 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3152 // for shaders. Vector types should be explicitly handled by CC.
3153 if (AMDGPU::isEntryFunctionCC(CallConv))
3154 return true;
3155
3157 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3158 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3159 return false;
3160
3161 // We must use the stack if return would require unavailable registers.
3162 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3163 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3164 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3165 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3166 return false;
3167
3168 return true;
3169}
3170
3171SDValue
3173 bool isVarArg,
3175 const SmallVectorImpl<SDValue> &OutVals,
3176 const SDLoc &DL, SelectionDAG &DAG) const {
3179
3180 if (AMDGPU::isKernel(CallConv)) {
3181 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3182 OutVals, DL, DAG);
3183 }
3184
3185 bool IsShader = AMDGPU::isShader(CallConv);
3186
3187 Info->setIfReturnsVoid(Outs.empty());
3188 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3189
3190 // CCValAssign - represent the assignment of the return value to a location.
3193
3194 // CCState - Info about the registers and stack slots.
3195 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3196 *DAG.getContext());
3197
3198 // Analyze outgoing return values.
3199 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3200
3201 SDValue Glue;
3203 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3204
3205 // Copy the result values into the output registers.
3206 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3207 ++I, ++RealRVLocIdx) {
3208 CCValAssign &VA = RVLocs[I];
3209 assert(VA.isRegLoc() && "Can only return in registers!");
3210 // TODO: Partially return in registers if return values don't fit.
3211 SDValue Arg = OutVals[RealRVLocIdx];
3212
3213 // Copied from other backends.
3214 switch (VA.getLocInfo()) {
3215 case CCValAssign::Full:
3216 break;
3217 case CCValAssign::BCvt:
3218 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3219 break;
3220 case CCValAssign::SExt:
3221 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3222 break;
3223 case CCValAssign::ZExt:
3224 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3225 break;
3226 case CCValAssign::AExt:
3227 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3228 break;
3229 default:
3230 llvm_unreachable("Unknown loc info!");
3231 }
3232
3233 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3234 Glue = Chain.getValue(1);
3235 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3236 }
3237
3238 // FIXME: Does sret work properly?
3239 if (!Info->isEntryFunction()) {
3240 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3241 const MCPhysReg *I =
3242 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3243 if (I) {
3244 for (; *I; ++I) {
3245 if (AMDGPU::SReg_64RegClass.contains(*I))
3246 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3247 else if (AMDGPU::SReg_32RegClass.contains(*I))
3248 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3249 else
3250 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3251 }
3252 }
3253 }
3254
3255 // Update chain and glue.
3256 RetOps[0] = Chain;
3257 if (Glue.getNode())
3258 RetOps.push_back(Glue);
3259
3260 unsigned Opc = AMDGPUISD::ENDPGM;
3261 if (!IsWaveEnd)
3263 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3264}
3265
3267 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3268 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3269 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3270 SDValue ThisVal) const {
3271 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3272
3273 // Assign locations to each value returned by this call.
3275 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3276 *DAG.getContext());
3277 CCInfo.AnalyzeCallResult(Ins, RetCC);
3278
3279 // Copy all of the result registers out of their specified physreg.
3280 for (CCValAssign VA : RVLocs) {
3281 SDValue Val;
3282
3283 if (VA.isRegLoc()) {
3284 Val =
3285 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3286 Chain = Val.getValue(1);
3287 InGlue = Val.getValue(2);
3288 } else if (VA.isMemLoc()) {
3289 report_fatal_error("TODO: return values in memory");
3290 } else
3291 llvm_unreachable("unknown argument location type");
3292
3293 switch (VA.getLocInfo()) {
3294 case CCValAssign::Full:
3295 break;
3296 case CCValAssign::BCvt:
3297 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3298 break;
3299 case CCValAssign::ZExt:
3300 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3301 DAG.getValueType(VA.getValVT()));
3302 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3303 break;
3304 case CCValAssign::SExt:
3305 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3306 DAG.getValueType(VA.getValVT()));
3307 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3308 break;
3309 case CCValAssign::AExt:
3310 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3311 break;
3312 default:
3313 llvm_unreachable("Unknown loc info!");
3314 }
3315
3316 InVals.push_back(Val);
3317 }
3318
3319 return Chain;
3320}
3321
3322// Add code to pass special inputs required depending on used features separate
3323// from the explicit user arguments present in the IR.
3325 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3326 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3327 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3328 // If we don't have a call site, this was a call inserted by
3329 // legalization. These can never use special inputs.
3330 if (!CLI.CB)
3331 return;
3332
3333 SelectionDAG &DAG = CLI.DAG;
3334 const SDLoc &DL = CLI.DL;
3335 const Function &F = DAG.getMachineFunction().getFunction();
3336
3337 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3338 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3339
3340 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3342 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3343 // DAG.getPass() returns nullptr when using new pass manager.
3344 // TODO: Use DAG.getMFAM() to access analysis result.
3345 if (DAG.getPass()) {
3346 auto &ArgUsageInfo =
3348 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3349 }
3350 }
3351
3352 // TODO: Unify with private memory register handling. This is complicated by
3353 // the fact that at least in kernels, the input argument is not necessarily
3354 // in the same location as the input.
3355 // clang-format off
3356 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3358 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3359 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3360 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3361 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3362 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3363 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3364 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3365 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3366 };
3367 // clang-format on
3368
3369 for (auto [InputID, Attr] : ImplicitAttrs) {
3370 // If the callee does not use the attribute value, skip copying the value.
3371 if (CLI.CB->hasFnAttr(Attr))
3372 continue;
3373
3374 const auto [OutgoingArg, ArgRC, ArgTy] =
3375 CalleeArgInfo->getPreloadedValue(InputID);
3376 if (!OutgoingArg)
3377 continue;
3378
3379 const auto [IncomingArg, IncomingArgRC, Ty] =
3380 CallerArgInfo.getPreloadedValue(InputID);
3381 assert(IncomingArgRC == ArgRC);
3382
3383 // All special arguments are ints for now.
3384 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3385 SDValue InputReg;
3386
3387 if (IncomingArg) {
3388 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3389 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3390 // The implicit arg ptr is special because it doesn't have a corresponding
3391 // input for kernels, and is computed from the kernarg segment pointer.
3392 InputReg = getImplicitArgPtr(DAG, DL);
3393 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3394 std::optional<uint32_t> Id =
3396 if (Id.has_value()) {
3397 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3398 } else {
3399 InputReg = DAG.getUNDEF(ArgVT);
3400 }
3401 } else {
3402 // We may have proven the input wasn't needed, although the ABI still
3403 // requires it. We just need to allocate the register appropriately.
3404 InputReg = DAG.getUNDEF(ArgVT);
3405 }
3406
3407 if (OutgoingArg->isRegister()) {
3408 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3409 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3410 report_fatal_error("failed to allocate implicit input argument");
3411 } else {
3412 unsigned SpecialArgOffset =
3413 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3414 SDValue ArgStore =
3415 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3416 MemOpChains.push_back(ArgStore);
3417 }
3418 }
3419
3420 // Pack workitem IDs into a single register, or pass them as-is if already
3421 // packed.
3422
3423 auto [OutgoingArg, ArgRC, Ty] =
3425 if (!OutgoingArg)
3426 std::tie(OutgoingArg, ArgRC, Ty) =
3428 if (!OutgoingArg)
3429 std::tie(OutgoingArg, ArgRC, Ty) =
3431 if (!OutgoingArg)
3432 return;
3433
3434 const ArgDescriptor *IncomingArgX = std::get<0>(
3436 const ArgDescriptor *IncomingArgY = std::get<0>(
3438 const ArgDescriptor *IncomingArgZ = std::get<0>(
3440
3441 SDValue InputReg;
3442 SDLoc SL;
3443
3444 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3445 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3446 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3447
3448 // If incoming ids are not packed we need to pack them.
3449 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3450 NeedWorkItemIDX) {
3451 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3452 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3453 } else {
3454 InputReg = DAG.getConstant(0, DL, MVT::i32);
3455 }
3456 }
3457
3458 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3459 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3460 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3461 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3462 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3463 InputReg = InputReg.getNode()
3464 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3465 : Y;
3466 }
3467
3468 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3469 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3470 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3471 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3472 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3473 InputReg = InputReg.getNode()
3474 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3475 : Z;
3476 }
3477
3478 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3479 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3480 // We're in a situation where the outgoing function requires the workitem
3481 // ID, but the calling function does not have it (e.g. a graphics function
3482 // calling a C calling convention function). This is illegal, but we need
3483 // to produce something.
3484 InputReg = DAG.getUNDEF(MVT::i32);
3485 } else {
3486 // Workitem IDs are already packed; any of the present incoming arguments
3487 // will carry all required fields.
3488 ArgDescriptor IncomingArg =
3489 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3490 : IncomingArgY ? *IncomingArgY
3491 : *IncomingArgZ,
3492 ~0u);
3493 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3494 }
3495 }
3496
3497 if (OutgoingArg->isRegister()) {
3498 if (InputReg)
3499 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3500
3501 CCInfo.AllocateReg(OutgoingArg->getRegister());
3502 } else {
3503 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3504 if (InputReg) {
3505 SDValue ArgStore =
3506 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3507 MemOpChains.push_back(ArgStore);
3508 }
3509 }
3510}
3511
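// --- Illustrative sketch (editor-added, not part of the original source). ---
// When the caller's workitem IDs are not already packed, passSpecialInputs
// above rebuilds the packed form with SHL/OR before passing it on. The helper
// below shows the resulting bit layout; the 0x3ff clamping is only there to
// keep the standalone example well defined.
#include <cassert>
#include <cstdint>

static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t Packed = X & 0x3ffu;  // X lives in bits [9:0]
  Packed |= (Y & 0x3ffu) << 10;  // Y is shifted into bits [19:10]
  Packed |= (Z & 0x3ffu) << 20;  // Z is shifted into bits [29:20]
  return Packed;
}

int main() {
  assert(packWorkItemIDs(9, 5, 3) == ((3u << 20) | (5u << 10) | 9u));
}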
3513 return CC == CallingConv::Fast;
3514}
3515
3516/// Return true if we might ever do TCO for calls with this calling convention.
3518 switch (CC) {
3519 case CallingConv::C:
3521 return true;
3522 default:
3523 return canGuaranteeTCO(CC);
3524 }
3525}
3526
3528 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3530 const SmallVectorImpl<SDValue> &OutVals,
3531 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3532 if (AMDGPU::isChainCC(CalleeCC))
3533 return true;
3534
3535 if (!mayTailCallThisCC(CalleeCC))
3536 return false;
3537
3538 // For a divergent call target, we need to do a waterfall loop over the
3539 // possible callees which precludes us from using a simple jump.
3540 if (Callee->isDivergent())
3541 return false;
3542
3544 const Function &CallerF = MF.getFunction();
3545 CallingConv::ID CallerCC = CallerF.getCallingConv();
3547 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3548
3549 // Kernels aren't callable, and don't have a live in return address so it
3550 // doesn't make sense to do a tail call with entry functions.
3551 if (!CallerPreserved)
3552 return false;
3553
3554 bool CCMatch = CallerCC == CalleeCC;
3555
3557 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3558 return true;
3559 return false;
3560 }
3561
3562 // TODO: Can we handle var args?
3563 if (IsVarArg)
3564 return false;
3565
3566 for (const Argument &Arg : CallerF.args()) {
3567 if (Arg.hasByValAttr())
3568 return false;
3569 }
3570
3571 LLVMContext &Ctx = *DAG.getContext();
3572
3573 // Check that the call results are passed in the same way.
3574 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3575 CCAssignFnForCall(CalleeCC, IsVarArg),
3576 CCAssignFnForCall(CallerCC, IsVarArg)))
3577 return false;
3578
3579 // The callee has to preserve all registers the caller needs to preserve.
3580 if (!CCMatch) {
3581 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3582 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3583 return false;
3584 }
3585
3586 // Nothing more to check if the callee is taking no arguments.
3587 if (Outs.empty())
3588 return true;
3589
3591 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3592
3593 // FIXME: We are not allocating special input registers, so we will be
3594 // deciding based on incorrect register assignments.
3595 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3596
3597 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3598 // If the stack arguments for this call do not fit into our own save area then
3599 // the call cannot be made tail.
3600 // TODO: Is this really necessary?
3601 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3602 return false;
3603
3604 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3605 // FIXME: What about inreg arguments that end up passed in memory?
3606 if (!CCVA.isRegLoc())
3607 continue;
3608
3609 // If we are passing an argument in an SGPR, and the value is divergent,
3610 // this call requires a waterfall loop.
3611 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3612 LLVM_DEBUG(
3613 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3614 << printReg(CCVA.getLocReg(), TRI) << '\n');
3615 return false;
3616 }
3617 }
3618
3619 const MachineRegisterInfo &MRI = MF.getRegInfo();
3620 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3621}
3622
3624 if (!CI->isTailCall())
3625 return false;
3626
3627 const Function *ParentFn = CI->getParent()->getParent();
3629 return false;
3630 return true;
3631}
3632
3633// The wave scratch offset register is used as the global base pointer.
3635 SmallVectorImpl<SDValue> &InVals) const {
3636 CallingConv::ID CallConv = CLI.CallConv;
3637 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3638
3639 SelectionDAG &DAG = CLI.DAG;
3640
3641 TargetLowering::ArgListEntry RequestedExec;
3642 if (IsChainCallConv) {
3643 // The last argument should be the value that we need to put in EXEC.
3644 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3645 // don't treat it like the rest of the arguments.
3646 RequestedExec = CLI.Args.back();
3647 assert(RequestedExec.Node && "No node for EXEC");
3648
3649 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3650 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3651
3652 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3653 CLI.Outs.pop_back();
3654 CLI.OutVals.pop_back();
3655
3656 if (RequestedExec.Ty->isIntegerTy(64)) {
3657 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3658 CLI.Outs.pop_back();
3659 CLI.OutVals.pop_back();
3660 }
3661
3662 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3663 "Haven't popped all the pieces of the EXEC mask");
3664 }
3665
3666 const SDLoc &DL = CLI.DL;
3668 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3670 SDValue Chain = CLI.Chain;
3671 SDValue Callee = CLI.Callee;
3672 bool &IsTailCall = CLI.IsTailCall;
3673 bool IsVarArg = CLI.IsVarArg;
3674 bool IsSibCall = false;
3676
3677 if (Callee.isUndef() || isNullConstant(Callee)) {
3678 if (!CLI.IsTailCall) {
3679 for (ISD::InputArg &Arg : CLI.Ins)
3680 InVals.push_back(DAG.getUNDEF(Arg.VT));
3681 }
3682
3683 return Chain;
3684 }
3685
3686 if (IsVarArg) {
3687 return lowerUnhandledCall(CLI, InVals,
3688 "unsupported call to variadic function ");
3689 }
3690
3691 if (!CLI.CB)
3692 report_fatal_error("unsupported libcall legalization");
3693
3694 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3695 return lowerUnhandledCall(CLI, InVals,
3696 "unsupported required tail call to function ");
3697 }
3698
3699 if (IsTailCall) {
3700 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3701 Outs, OutVals, Ins, DAG);
3702 if (!IsTailCall &&
3703 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3704 report_fatal_error("failed to perform tail call elimination on a call "
3705 "site marked musttail or on llvm.amdgcn.cs.chain");
3706 }
3707
3708 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3709
3710 // A sibling call is one where we're under the usual C ABI and not planning
3711 // to change that, but can still do a tail call.
3712 if (!TailCallOpt && IsTailCall)
3713 IsSibCall = true;
3714
3715 if (IsTailCall)
3716 ++NumTailCalls;
3717 }
3718
3721 SmallVector<SDValue, 8> MemOpChains;
3722
3723 // Analyze operands of the call, assigning locations to each operand.
3725 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3726 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3727
3728 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3729 // With a fixed ABI, allocate fixed registers before user arguments.
3730 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3731 }
3732
3733 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3734
3735 // Get a count of how many bytes are to be pushed on the stack.
3736 unsigned NumBytes = CCInfo.getStackSize();
3737
3738 if (IsSibCall) {
3739 // Since we're not changing the ABI to make this a tail call, the memory
3740 // operands are already available in the caller's incoming argument space.
3741 NumBytes = 0;
3742 }
3743
3744 // FPDiff is the byte offset of the call's argument area from the callee's.
3745 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3746 // by this amount for a tail call. In a sibling call it must be 0 because the
3747 // caller will deallocate the entire stack and the callee still expects its
3748 // arguments to begin at SP+0. Completely unused for non-tail calls.
3749 int32_t FPDiff = 0;
3750 MachineFrameInfo &MFI = MF.getFrameInfo();
3751 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3752
3753 // Adjust the stack pointer for the new arguments...
3754 // These operations are automatically eliminated by the prolog/epilog pass
3755 if (!IsSibCall)
3756 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3757
3758 if (!IsSibCall || IsChainCallConv) {
3759 if (!Subtarget->enableFlatScratch()) {
3760 SmallVector<SDValue, 4> CopyFromChains;
3761
3762 // In the HSA case, this should be an identity copy.
3763 SDValue ScratchRSrcReg =
3764 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3765 RegsToPass.emplace_back(IsChainCallConv
3766 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3767 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3768 ScratchRSrcReg);
3769 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3770 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3771 }
3772 }
3773
3774 const unsigned NumSpecialInputs = RegsToPass.size();
3775
3776 MVT PtrVT = MVT::i32;
3777
3778 // Walk the register/memloc assignments, inserting copies/loads.
3779 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3780 CCValAssign &VA = ArgLocs[i];
3781 SDValue Arg = OutVals[i];
3782
3783 // Promote the value if needed.
3784 switch (VA.getLocInfo()) {
3785 case CCValAssign::Full:
3786 break;
3787 case CCValAssign::BCvt:
3788 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3789 break;
3790 case CCValAssign::ZExt:
3791 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3792 break;
3793 case CCValAssign::SExt:
3794 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3795 break;
3796 case CCValAssign::AExt:
3797 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3798 break;
3799 case CCValAssign::FPExt:
3800 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3801 break;
3802 default:
3803 llvm_unreachable("Unknown loc info!");
3804 }
3805
3806 if (VA.isRegLoc()) {
3807 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3808 } else {
3809 assert(VA.isMemLoc());
3810
3811 SDValue DstAddr;
3812 MachinePointerInfo DstInfo;
3813
3814 unsigned LocMemOffset = VA.getLocMemOffset();
3815 int32_t Offset = LocMemOffset;
3816
3817 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3818 MaybeAlign Alignment;
3819
3820 if (IsTailCall) {
3821 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3822 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3823 : VA.getValVT().getStoreSize();
3824
3825 // FIXME: We can have better than the minimum required byval alignment.
3826 Alignment =
3827 Flags.isByVal()
3828 ? Flags.getNonZeroByValAlign()
3829 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3830
3831 Offset = Offset + FPDiff;
3832 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3833
3834 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3835 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3836
3837 // Make sure any stack arguments overlapping with where we're storing
3838 // are loaded before this eventual operation. Otherwise they'll be
3839 // clobbered.
3840
3841 // FIXME: Why is this really necessary? This seems to just result in a
3842 // lot of code to copy the stack arguments and write them back to the
3843 // same locations, which are supposed to be immutable?
3844 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3845 } else {
3846 // Stores to the argument stack area are relative to the stack pointer.
3847 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3848 MVT::i32);
3849 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3850 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3851 Alignment =
3852 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3853 }
3854
3855 if (Outs[i].Flags.isByVal()) {
3856 SDValue SizeNode =
3857 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3858 SDValue Cpy =
3859 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3860 Outs[i].Flags.getNonZeroByValAlign(),
3861 /*isVol = */ false, /*AlwaysInline = */ true,
3862 /*CI=*/nullptr, std::nullopt, DstInfo,
3864
3865 MemOpChains.push_back(Cpy);
3866 } else {
3867 SDValue Store =
3868 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3869 MemOpChains.push_back(Store);
3870 }
3871 }
3872 }
3873
3874 if (!MemOpChains.empty())
3875 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3876
3877 SDValue ReadFirstLaneID =
3878 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3879
3880 SDValue TokenGlue;
3881 if (CLI.ConvergenceControlToken) {
3882 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3884 }
3885
3886 // Build a sequence of copy-to-reg nodes chained together with token chain
3887 // and flag operands which copy the outgoing args into the appropriate regs.
3888 SDValue InGlue;
3889
3890 unsigned ArgIdx = 0;
3891 for (auto [Reg, Val] : RegsToPass) {
3892 if (ArgIdx++ >= NumSpecialInputs &&
3893 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3894 // For chain calls, the inreg arguments are required to be
3895 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3896 // they are uniform.
3897 //
3898 // For other calls, if an inreg argument is known to be uniform,
3899 // speculatively insert a readfirstlane in case it is in a VGPR.
3900 //
3901 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3902 // value, so let that continue to produce invalid code.
3903
3904 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3905 if (TokenGlue)
3906 ReadfirstlaneArgs.push_back(TokenGlue);
3908 ReadfirstlaneArgs);
3909 }
3910
3911 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3912 InGlue = Chain.getValue(1);
3913 }
3914
3915 // We don't usually want to end the call-sequence here because we would tidy
3916 // the frame up *after* the call; however, in the ABI-changing tail-call case
3917 // we've carefully laid out the parameters so that when SP is reset they'll be
3918 // in the correct location.
3919 if (IsTailCall && !IsSibCall) {
3920 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3921 InGlue = Chain.getValue(1);
3922 }
3923
3924 std::vector<SDValue> Ops({Chain});
3925
3926 // Add a redundant copy of the callee global which will not be legalized, as
3927 // we need direct access to the callee later.
3928 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3929 const GlobalValue *GV = GSD->getGlobal();
3930 Ops.push_back(Callee);
3931 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3932 } else {
3933 if (IsTailCall) {
3934 // isEligibleForTailCallOptimization considered whether the call target is
3935 // divergent, but we may still end up with a uniform value in a VGPR.
3936 // Insert a readfirstlane just in case.
3937 SDValue ReadFirstLaneID =
3938 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3939
3940 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3941 if (TokenGlue)
3942 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3943 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3944 ReadfirstlaneArgs);
3945 }
3946
3947 Ops.push_back(Callee);
3948 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3949 }
3950
3951 if (IsTailCall) {
3952 // Each tail call may have to adjust the stack by a different amount, so
3953 // this information must travel along with the operation for eventual
3954 // consumption by emitEpilogue.
3955 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3956 }
3957
3958 if (IsChainCallConv)
3959 Ops.push_back(RequestedExec.Node);
3960
3961 // Add argument registers to the end of the list so that they are known live
3962 // into the call.
3963 for (auto &[Reg, Val] : RegsToPass)
3964 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3965
3966 // Add a register mask operand representing the call-preserved registers.
3967 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3968 assert(Mask && "Missing call preserved mask for calling convention");
3969 Ops.push_back(DAG.getRegisterMask(Mask));
3970
3971 if (SDValue Token = CLI.ConvergenceControlToken) {
3973 GlueOps.push_back(Token);
3974 if (InGlue)
3975 GlueOps.push_back(InGlue);
3976
3977 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3978 MVT::Glue, GlueOps),
3979 0);
3980 }
3981
3982 if (InGlue)
3983 Ops.push_back(InGlue);
3984
3985 // If we're doing a tail call, use a TC_RETURN here rather than an
3986 // actual call instruction.
3987 if (IsTailCall) {
3988 MFI.setHasTailCall();
3989 unsigned OPC = AMDGPUISD::TC_RETURN;
3990 switch (CallConv) {
3993 break;
3997 break;
3998 }
3999
4000 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4001 }
4002
4003 // Returns a chain and a flag for retval copy to use.
4004 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4005 Chain = Call.getValue(0);
4006 InGlue = Call.getValue(1);
4007
4008 uint64_t CalleePopBytes = NumBytes;
4009 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4010 if (!Ins.empty())
4011 InGlue = Chain.getValue(1);
4012
4013 // Handle result values, copying them out of physregs into vregs that we
4014 // return.
4015 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4016 InVals, /*IsThisReturn=*/false, SDValue());
4017}
4018
4019// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020// except for applying the wave size scale to the increment amount.
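// For example (a sketch; a wave64 subtarget is assumed here): an allocation of
// Size bytes per lane bumps the wave-uniform stack pointer by Size << 6,
// because the scratch backing memory is allocated for every lane in the wave.
// The alignment mask below is scaled by the same wavefront-size factor.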
4022 SelectionDAG &DAG) const {
4023 const MachineFunction &MF = DAG.getMachineFunction();
4025
4026 SDLoc dl(Op);
4027 EVT VT = Op.getValueType();
4028 SDValue Tmp1 = Op;
4029 SDValue Tmp2 = Op.getValue(1);
4030 SDValue Tmp3 = Op.getOperand(2);
4031 SDValue Chain = Tmp1.getOperand(0);
4032
4033 Register SPReg = Info->getStackPtrOffsetReg();
4034
4035 // Chain the dynamic stack allocation so that it doesn't modify the stack
4036 // pointer when other instructions are using the stack.
4037 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4038
4039 SDValue Size = Tmp2.getOperand(1);
4040 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4041 Chain = SP.getValue(1);
4042 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
4043 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4045 "Stack grows upwards for AMDGPU");
4046
4047 SDValue ScaledSize = DAG.getNode(
4048 ISD::SHL, dl, VT, Size,
4049 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4050
4051 Align StackAlign = TFL->getStackAlign();
4052 Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
4053 if (Alignment && *Alignment > StackAlign) {
4054 Tmp1 = DAG.getNode(
4055 ISD::AND, dl, VT, Tmp1,
4056 DAG.getSignedConstant(-(uint64_t)Alignment->value()
4057 << Subtarget->getWavefrontSizeLog2(),
4058 dl, VT));
4059 }
4060
4061 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4062 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4063
4064 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4065}
4066
4068 SelectionDAG &DAG) const {
4069 // We only handle constant sizes here to allow non-entry block, static sized
4070 // allocas. A truly dynamic value is more difficult to support because we
4071 // don't know if the size value is uniform or not. If the size isn't uniform,
4072 // we would need to do a wave reduction to get the maximum size to know how
4073 // much to increment the uniform stack pointer.
4074 SDValue Size = Op.getOperand(1);
4075 if (isa<ConstantSDNode>(Size))
4076 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4077
4079}
4080
4082 if (Op.getValueType() != MVT::i32)
4083 return Op; // Defer to cannot select error.
4084
4086 SDLoc SL(Op);
4087
4088 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4089
4090 // Convert from wave uniform to swizzled vector address. This should protect
4091 // against any edge cases where the stacksave result isn't directly used with
4092 // stackrestore.
4093 SDValue VectorAddress =
4094 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4095 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4096}
4097
4099 SelectionDAG &DAG) const {
4100 SDLoc SL(Op);
4101 assert(Op.getValueType() == MVT::i32);
4102
4103 uint32_t BothRoundHwReg =
4105 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4106
4107 SDValue IntrinID =
4108 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4109 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4110 Op.getOperand(0), IntrinID, GetRoundBothImm);
4111
4112 // There are two rounding modes, one for f32 and one for f64/f16. We only
4113 // report in the standard value range if both are the same.
4114 //
4115 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4116 // ties away from zero is not supported, and the other values are rotated by
4117 // 1.
4118 //
4119 // If the two rounding modes are not the same, report a target defined value.
4120
4121 // Mode register rounding mode fields:
4122 //
4123 // [1:0] Single-precision round mode.
4124 // [3:2] Double/Half-precision round mode.
4125 //
4126 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4127 //
4128 // Hardware Spec
4129 // Toward-0 3 0
4130 // Nearest Even 0 1
4131 // +Inf 1 2
4132 // -Inf 2 3
4133 // NearestAway0 N/A 4
4134 //
4135 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4136 // table we can index by the raw hardware mode.
4137 //
4138 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
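  // Worked example, using only the mapping documented above: if MODE.fp_round
  // reads back as 0b0101 (both fields set to hardware value 1, +infinity), the
  // shift amount computed below is 5 << 2 == 20, and the extracted 4-bit entry
  // is expected to be the spec value 2. Entries of 4 and above are the
  // extended, target-defined results and get the extra offset applied below.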
4139
4140 SDValue BitTable =
4142
4143 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4144 SDValue RoundModeTimesNumBits =
4145 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4146
4147 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4148 // knew only one mode was demanded.
4149 SDValue TableValue =
4150 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4151 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4152
4153 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4154 SDValue TableEntry =
4155 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4156
4157 // There's a gap between the 4-bit encoded table values and the actual enum
4158 // values, so offset the result if it's an extended value.
4159 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4160 SDValue IsStandardValue =
4161 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4162 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4163 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4164 TableEntry, EnumOffset);
4165
4166 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4167}
4168
4170 SelectionDAG &DAG) const {
4171 SDLoc SL(Op);
4172
4173 SDValue NewMode = Op.getOperand(1);
4174 assert(NewMode.getValueType() == MVT::i32);
4175
4176 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4177 // hardware MODE.fp_round values.
4178 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4179 uint32_t ClampedVal = std::min(
4180 static_cast<uint32_t>(ConstMode->getZExtValue()),
4182 NewMode = DAG.getConstant(
4183 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4184 } else {
4185 // If we know the input can only be one of the supported standard modes in
4186 // the range 0-3, we can use a simplified mapping to hardware values.
4187 KnownBits KB = DAG.computeKnownBits(NewMode);
4188 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4189 // The supported standard values are 0-3. The extended values start at 8. We
4190 // need to offset by 4 if the value is in the extended range.
4191
4192 if (UseReducedTable) {
4193 // Truncate to the low 32-bits.
4194 SDValue BitTable = DAG.getConstant(
4195 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4196
4197 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4198 SDValue RoundModeTimesNumBits =
4199 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4200
4201 NewMode =
4202 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4203
4204 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4205 // the extracted table bits into inline immediates.
4206 } else {
4207 // table_index = umin(value, value - 4)
4208 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
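      // The unsigned min folds the two disjoint input ranges into one
      // contiguous table index: for the standard values 0-3, value - 4 wraps
      // to a huge unsigned number, so umin returns the value itself; for the
      // extended values (8 and up) value - 4 is the smaller operand, removing
      // the gap and producing indices starting at 4.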
4209 SDValue BitTable =
4211
4212 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4213 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4214 SDValue IndexVal =
4215 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4216
4217 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4218 SDValue RoundModeTimesNumBits =
4219 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4220
4221 SDValue TableValue =
4222 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4223 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4224
4225 // No need to mask out the high bits since the setreg will ignore them
4226 // anyway.
4227 NewMode = TruncTable;
4228 }
4229
4230 // Insert a readfirstlane in case the value is a VGPR. We could do this
4231 // earlier and keep more operations scalar, but that interferes with
4232 // combining the source.
4233 SDValue ReadFirstLaneID =
4234 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4235 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4236 ReadFirstLaneID, NewMode);
4237 }
4238
4239 // N.B. The setreg will be later folded into s_round_mode on supported
4240 // targets.
4241 SDValue IntrinID =
4242 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4243 uint32_t BothRoundHwReg =
4245 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4246
4247 SDValue SetReg =
4248 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4249 IntrinID, RoundBothImm, NewMode);
4250
4251 return SetReg;
4252}
4253
4255 if (Op->isDivergent())
4256 return SDValue();
4257
4258 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4263 break;
4264 default:
4265 return SDValue();
4266 }
4267
4268 return Op;
4269}
4270
4271 // Work around DAG legality rules that are based only on the result type.
4273 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4274 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4275 EVT SrcVT = Src.getValueType();
4276
4277 if (SrcVT.getScalarType() != MVT::bf16)
4278 return Op;
4279
4280 SDLoc SL(Op);
4281 SDValue BitCast =
4282 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4283
4284 EVT DstVT = Op.getValueType();
4285 if (IsStrict)
4286 llvm_unreachable("Need STRICT_BF16_TO_FP");
4287
4288 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4289}
4290
4292 SDLoc SL(Op);
4293 if (Op.getValueType() != MVT::i64)
4294 return Op;
4295
4296 uint32_t ModeHwReg =
4298 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4299 uint32_t TrapHwReg =
4301 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4302
4303 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4304 SDValue IntrinID =
4305 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4306 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4307 Op.getOperand(0), IntrinID, ModeHwRegImm);
4308 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4309 Op.getOperand(0), IntrinID, TrapHwRegImm);
4310 SDValue TokenReg =
4311 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4312 GetTrapReg.getValue(1));
4313
4314 SDValue CvtPtr =
4315 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4316 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4317
4318 return DAG.getMergeValues({Result, TokenReg}, SL);
4319}
4320
4322 SDLoc SL(Op);
4323 if (Op.getOperand(1).getValueType() != MVT::i64)
4324 return Op;
4325
4326 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4327 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4328 DAG.getConstant(0, SL, MVT::i32));
4329 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4330 DAG.getConstant(1, SL, MVT::i32));
4331
4332 SDValue ReadFirstLaneID =
4333 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4334 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4335 ReadFirstLaneID, NewModeReg);
4336 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4337 ReadFirstLaneID, NewTrapReg);
4338
4339 unsigned ModeHwReg =
4341 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4342 unsigned TrapHwReg =
4344 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4345
4346 SDValue IntrinID =
4347 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4348 SDValue SetModeReg =
4349 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4350 IntrinID, ModeHwRegImm, NewModeReg);
4351 SDValue SetTrapReg =
4352 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4353 IntrinID, TrapHwRegImm, NewTrapReg);
4354 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4355}
4356
4358 const MachineFunction &MF) const {
4360 .Case("m0", AMDGPU::M0)
4361 .Case("exec", AMDGPU::EXEC)
4362 .Case("exec_lo", AMDGPU::EXEC_LO)
4363 .Case("exec_hi", AMDGPU::EXEC_HI)
4364 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4365 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4366 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4367 .Default(Register());
4368
4369 if (Reg == AMDGPU::NoRegister) {
4371 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4372 }
4373
4374 if (!Subtarget->hasFlatScrRegister() &&
4375 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4376 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4377 "\" for subtarget."));
4378 }
4379
4380 switch (Reg) {
4381 case AMDGPU::M0:
4382 case AMDGPU::EXEC_LO:
4383 case AMDGPU::EXEC_HI:
4384 case AMDGPU::FLAT_SCR_LO:
4385 case AMDGPU::FLAT_SCR_HI:
4386 if (VT.getSizeInBits() == 32)
4387 return Reg;
4388 break;
4389 case AMDGPU::EXEC:
4390 case AMDGPU::FLAT_SCR:
4391 if (VT.getSizeInBits() == 64)
4392 return Reg;
4393 break;
4394 default:
4395 llvm_unreachable("missing register type checking");
4396 }
4397
4399 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4400}
4401
4402// If kill is not the last instruction, split the block so kill is always a
4403// proper terminator.
4406 MachineBasicBlock *BB) const {
4407 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4409 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4410 return SplitBB;
4411}
4412
4413 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4414// \p MI will be the only instruction in the loop body block. Otherwise, it will
4415// be the first instruction in the remainder block.
4416//
4417/// \returns { LoopBody, Remainder }
4418static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4422
4423 // To insert the loop we need to split the block. Move everything after this
4424 // point to a new block, and insert a new empty block between the two.
4426 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4428 ++MBBI;
4429
4430 MF->insert(MBBI, LoopBB);
4431 MF->insert(MBBI, RemainderBB);
4432
4433 LoopBB->addSuccessor(LoopBB);
4434 LoopBB->addSuccessor(RemainderBB);
4435
4436 // Move the rest of the block into a new block.
4437 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4438
4439 if (InstInLoop) {
4440 auto Next = std::next(I);
4441
4442 // Move instruction to loop body.
4443 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4444
4445 // Move the rest of the block.
4446 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4447 } else {
4448 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4449 }
4450
4451 MBB.addSuccessor(LoopBB);
4452
4453 return std::pair(LoopBB, RemainderBB);
4454}
4455
4456/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4458 MachineBasicBlock *MBB = MI.getParent();
4460 auto I = MI.getIterator();
4461 auto E = std::next(I);
4462
4463 // clang-format off
4464 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4465 .addImm(0);
4466 // clang-format on
4467
4468 MIBundleBuilder Bundler(*MBB, I, E);
4469 finalizeBundle(*MBB, Bundler.begin());
4470}
4471
4474 MachineBasicBlock *BB) const {
4475 const DebugLoc &DL = MI.getDebugLoc();
4476
4478
4480
4481 // Apparently kill flags are only valid if the def is in the same block?
4482 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4483 Src->setIsKill(false);
4484
4485 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4486
4487 MachineBasicBlock::iterator I = LoopBB->end();
4488
4489 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4491
4492 // Clear TRAP_STS.MEM_VIOL
4493 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4494 .addImm(0)
4495 .addImm(EncodedReg);
4496
4498
4499 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4500
4501 // Load and check TRAP_STS.MEM_VIOL
4502 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4503 .addImm(EncodedReg);
4504
4505 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4506 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4507 .addReg(Reg, RegState::Kill)
4508 .addImm(0);
4509 // clang-format off
4510 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4511 .addMBB(LoopBB);
4512 // clang-format on
4513
4514 return RemainderBB;
4515}
4516
4517// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4518// wavefront. If the value is uniform and just happens to be in a VGPR, this
4519// will only do one iteration. In the worst case, this will loop 64 times.
4520//
4521// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4524 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4525 const DebugLoc &DL, const MachineOperand &Idx,
4526 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4527 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4528 Register &SGPRIdxReg) {
4529
4530 MachineFunction *MF = OrigBB.getParent();
4531 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4532 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4534
4535 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4536 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4537 Register NewExec = MRI.createVirtualRegister(BoolRC);
4538 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4539 Register CondReg = MRI.createVirtualRegister(BoolRC);
4540
4541 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4542 .addReg(InitReg)
4543 .addMBB(&OrigBB)
4544 .addReg(ResultReg)
4545 .addMBB(&LoopBB);
4546
4547 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4548 .addReg(InitSaveExecReg)
4549 .addMBB(&OrigBB)
4550 .addReg(NewExec)
4551 .addMBB(&LoopBB);
4552
4553 // Read the next variant <- also loop target.
4554 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4555 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4556
4557 // Compare the just-read index value to all possible Idx values.
4558 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4559 .addReg(CurrentIdxReg)
4560 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4561
4562 // Update EXEC, save the original EXEC value to NewExec.
4563 BuildMI(LoopBB, I, DL,
4564 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4565 : AMDGPU::S_AND_SAVEEXEC_B64),
4566 NewExec)
4567 .addReg(CondReg, RegState::Kill);
4568
4569 MRI.setSimpleHint(NewExec, CondReg);
4570
4571 if (UseGPRIdxMode) {
4572 if (Offset == 0) {
4573 SGPRIdxReg = CurrentIdxReg;
4574 } else {
4575 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4576 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4577 .addReg(CurrentIdxReg, RegState::Kill)
4578 .addImm(Offset);
4579 }
4580 } else {
4581 // Move index from CurrentIdxReg into M0
4582 if (Offset == 0) {
4583 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4584 .addReg(CurrentIdxReg, RegState::Kill);
4585 } else {
4586 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4587 .addReg(CurrentIdxReg, RegState::Kill)
4588 .addImm(Offset);
4589 }
4590 }
4591
4592 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4593 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4594 MachineInstr *InsertPt =
4595 BuildMI(LoopBB, I, DL,
4596 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4597 : AMDGPU::S_XOR_B64_term),
4598 Exec)
4599 .addReg(Exec)
4600 .addReg(NewExec);
4601
4602 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4603 // s_cbranch_scc0?
4604
4605 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4606 // clang-format off
4607 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4608 .addMBB(&LoopBB);
4609 // clang-format on
4610
4611 return InsertPt->getIterator();
4612}
4613
4614 // This has slightly sub-optimal regalloc when the source vector is killed by
4615 // the read. The register allocator does not understand that the kill is
4616 // per-workitem, so the vector is kept alive for the whole loop and we end up
4617 // not reusing a subregister from it, using one more VGPR than necessary. That
4618 // extra register was saved when this was expanded after register allocation.
4621 unsigned InitResultReg, unsigned PhiReg, int Offset,
4622 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4624 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4625 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4627 const DebugLoc &DL = MI.getDebugLoc();
4629
4630 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4631 Register DstReg = MI.getOperand(0).getReg();
4632 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4633 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4634 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4635 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4636
4637 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4638
4639 // Save the EXEC mask
4640 // clang-format off
4641 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4642 .addReg(Exec);
4643 // clang-format on
4644
4645 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4646
4647 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4648
4649 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4650 InitResultReg, DstReg, PhiReg, TmpExec,
4651 Offset, UseGPRIdxMode, SGPRIdxReg);
4652
4653 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4655 ++MBBI;
4656 MF->insert(MBBI, LandingPad);
4657 LoopBB->removeSuccessor(RemainderBB);
4658 LandingPad->addSuccessor(RemainderBB);
4659 LoopBB->addSuccessor(LandingPad);
4660 MachineBasicBlock::iterator First = LandingPad->begin();
4661 // clang-format off
4662 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4663 .addReg(SaveExec);
4664 // clang-format on
4665
4666 return InsPt;
4667}
4668
4669// Returns subreg index, offset
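// For example, a constant offset of 2 into a 128-bit (4 x 32-bit) register
// resolves to (sub2, 0) -- a plain subregister access with no residual offset,
// assuming the usual sub0..subN channel subregisters -- while an out-of-range
// or negative offset falls back to (sub0, Offset) so the dynamic part is still
// applied at runtime.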
4670static std::pair<unsigned, int>
4672 const TargetRegisterClass *SuperRC, unsigned VecReg,
4673 int Offset) {
4674 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4675
4676 // Skip out of bounds offsets, or else we would end up using an undefined
4677 // register.
4678 if (Offset >= NumElts || Offset < 0)
4679 return std::pair(AMDGPU::sub0, Offset);
4680
4681 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4682}
4683
4686 int Offset) {
4687 MachineBasicBlock *MBB = MI.getParent();
4688 const DebugLoc &DL = MI.getDebugLoc();
4690
4691 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4692
4693 assert(Idx->getReg() != AMDGPU::NoRegister);
4694
4695 if (Offset == 0) {
4696 // clang-format off
4697 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4698 .add(*Idx);
4699 // clang-format on
4700 } else {
4701 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4702 .add(*Idx)
4703 .addImm(Offset);
4704 }
4705}
4706
4709 int Offset) {
4710 MachineBasicBlock *MBB = MI.getParent();
4711 const DebugLoc &DL = MI.getDebugLoc();
4713
4714 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4715
4716 if (Offset == 0)
4717 return Idx->getReg();
4718
4719 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4720 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4721 .add(*Idx)
4722 .addImm(Offset);
4723 return Tmp;
4724}
4725
4728 const GCNSubtarget &ST) {
4729 const SIInstrInfo *TII = ST.getInstrInfo();
4730 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4733
4734 Register Dst = MI.getOperand(0).getReg();
4735 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4736 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4737 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4738
4739 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4740 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4741
4742 unsigned SubReg;
4743 std::tie(SubReg, Offset) =
4744 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4745
4746 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4747
4748 // Check for an SGPR index.
4749 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4751 const DebugLoc &DL = MI.getDebugLoc();
4752
4753 if (UseGPRIdxMode) {
4754 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4755 // to avoid interfering with other uses, so probably requires a new
4756 // optimization pass.
4758
4759 const MCInstrDesc &GPRIDXDesc =
4760 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4761 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4762 .addReg(SrcReg)
4763 .addReg(Idx)
4764 .addImm(SubReg);
4765 } else {
4767
4768 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4769 .addReg(SrcReg, 0, SubReg)
4770 .addReg(SrcReg, RegState::Implicit);
4771 }
4772
4773 MI.eraseFromParent();
4774
4775 return &MBB;
4776 }
4777
4778 // Control flow needs to be inserted if indexing with a VGPR.
4779 const DebugLoc &DL = MI.getDebugLoc();
4781
4782 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4783 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4784
4785 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4786
4787 Register SGPRIdxReg;
4788 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4789 UseGPRIdxMode, SGPRIdxReg);
4790
4791 MachineBasicBlock *LoopBB = InsPt->getParent();
4792
4793 if (UseGPRIdxMode) {
4794 const MCInstrDesc &GPRIDXDesc =
4795 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4796
4797 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4798 .addReg(SrcReg)
4799 .addReg(SGPRIdxReg)
4800 .addImm(SubReg);
4801 } else {
4802 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4803 .addReg(SrcReg, 0, SubReg)
4804 .addReg(SrcReg, RegState::Implicit);
4805 }
4806
4807 MI.eraseFromParent();
4808
4809 return LoopBB;
4810}
4811
4814 const GCNSubtarget &ST) {
4815 const SIInstrInfo *TII = ST.getInstrInfo();
4816 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4819
4820 Register Dst = MI.getOperand(0).getReg();
4821 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4822 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4823 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4824 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4825 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4826 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4827
4828 // This can be an immediate, but will be folded later.
4829 assert(Val->getReg());
4830
4831 unsigned SubReg;
4832 std::tie(SubReg, Offset) =
4833 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4834 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4835
4836 if (Idx->getReg() == AMDGPU::NoRegister) {
4838 const DebugLoc &DL = MI.getDebugLoc();
4839
4840 assert(Offset == 0);
4841
4842 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4843 .add(*SrcVec)
4844 .add(*Val)
4845 .addImm(SubReg);
4846
4847 MI.eraseFromParent();
4848 return &MBB;
4849 }
4850
4851 // Check for an SGPR index.
4852 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4854 const DebugLoc &DL = MI.getDebugLoc();
4855
4856 if (UseGPRIdxMode) {
4858
4859 const MCInstrDesc &GPRIDXDesc =
4860 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4861 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4862 .addReg(SrcVec->getReg())
4863 .add(*Val)
4864 .addReg(Idx)
4865 .addImm(SubReg);
4866 } else {
4868
4869 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4870 TRI.getRegSizeInBits(*VecRC), 32, false);
4871 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4872 .addReg(SrcVec->getReg())
4873 .add(*Val)
4874 .addImm(SubReg);
4875 }
4876 MI.eraseFromParent();
4877 return &MBB;
4878 }
4879
4880 // Control flow needs to be inserted if indexing with a VGPR.
4881 if (Val->isReg())
4882 MRI.clearKillFlags(Val->getReg());
4883
4884 const DebugLoc &DL = MI.getDebugLoc();
4885
4886 Register PhiReg = MRI.createVirtualRegister(VecRC);
4887
4888 Register SGPRIdxReg;
4889 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4890 UseGPRIdxMode, SGPRIdxReg);
4891 MachineBasicBlock *LoopBB = InsPt->getParent();
4892
4893 if (UseGPRIdxMode) {
4894 const MCInstrDesc &GPRIDXDesc =
4895 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4896
4897 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4898 .addReg(PhiReg)
4899 .add(*Val)
4900 .addReg(SGPRIdxReg)
4901 .addImm(SubReg);
4902 } else {
4903 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4904 TRI.getRegSizeInBits(*VecRC), 32, false);
4905 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4906 .addReg(PhiReg)
4907 .add(*Val)
4908 .addImm(SubReg);
4909 }
4910
4911 MI.eraseFromParent();
4912 return LoopBB;
4913}
4914
4917 const GCNSubtarget &ST,
4918 unsigned Opc) {
4920 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4921 const DebugLoc &DL = MI.getDebugLoc();
4922 const SIInstrInfo *TII = ST.getInstrInfo();
4923
4924 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4925 Register SrcReg = MI.getOperand(1).getReg();
4926 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4927 Register DstReg = MI.getOperand(0).getReg();
4928 MachineBasicBlock *RetBB = nullptr;
4929 if (isSGPR) {
4930 // These operations are idempotent for a uniform value, i.e. an SGPR input:
4931 // the reduced value is the same as the given SGPR.
4932 // clang-format off
4933 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4934 .addReg(SrcReg);
4935 // clang-format on
4936 RetBB = &BB;
4937 } else {
4938 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4939 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4940 // iterative approach by default.
4941
4942 // To reduce the VGPR using the iterative approach, we need to iterate
4943 // over all the active lanes. Lowering consists of a ComputeLoop, which
4944 // iterates over only the active lanes. We use a copy of the EXEC register
4945 // as the induction variable, and each iteration clears the current lane's
4946 // bit with bitset0 so that the next iteration picks up the next active lane.
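    // In rough pseudocode, the loop constructed below is:
    //   bits = copy(EXEC); acc = identity;
    //   do {
    //     lane = ff1(bits);                       // lowest remaining active lane
    //     acc  = op(acc, v_readlane(src, lane));
    //     bits = bitset0(bits, lane);             // retire that lane
    //   } while (bits != 0);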
4948 Register SrcReg = MI.getOperand(1).getReg();
4949
4950 // Create control flow for the loop:
4951 // split MI's machine basic block to form the loop.
4952 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4953
4954 // Create virtual registers required for lowering.
4955 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4956 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4957 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4958 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4959
4960 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4961 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4962 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4963
4964 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4965 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4966
4967 bool IsWave32 = ST.isWave32();
4968 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4969 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4970
4971 // Create initial values of the induction variable from EXEC and the
4972 // accumulator, and insert a branch to the newly created ComputeLoop block.
4973 uint32_t InitalValue =
4974 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4975 auto TmpSReg =
4976 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4977 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4978 .addImm(InitalValue);
4979 // clang-format off
4980 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
4981 .addMBB(ComputeLoop);
4982 // clang-format on
4983
4984 // Start constructing ComputeLoop
4985 I = ComputeLoop->end();
4986 auto Accumulator =
4987 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4988 .addReg(InitalValReg)
4989 .addMBB(&BB);
4990 auto ActiveBits =
4991 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4992 .addReg(TmpSReg->getOperand(0).getReg())
4993 .addMBB(&BB);
4994
4995 // Perform the computations
4996 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4997 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4998 .addReg(ActiveBits->getOperand(0).getReg());
4999 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5000 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5001 .addReg(SrcReg)
5002 .addReg(FF1->getOperand(0).getReg());
5003 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5004 .addReg(Accumulator->getOperand(0).getReg())
5005 .addReg(LaneValue->getOperand(0).getReg());
5006
5007 // Manipulate the iterator to get the next active lane
5008 unsigned BITSETOpc =
5009 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5010 auto NewActiveBits =
5011 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5012 .addReg(FF1->getOperand(0).getReg())
5013 .addReg(ActiveBits->getOperand(0).getReg());
5014
5015 // Add phi nodes
5016 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5017 .addMBB(ComputeLoop);
5018 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5019 .addMBB(ComputeLoop);
5020
5021 // Create the conditional branch back to ComputeLoop.
5022 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5023 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5024 .addReg(NewActiveBits->getOperand(0).getReg())
5025 .addImm(0);
5026 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5027 .addMBB(ComputeLoop);
5028
5029 RetBB = ComputeEnd;
5030 }
5031 MI.eraseFromParent();
5032 return RetBB;
5033}
5034
5037 MachineBasicBlock *BB) const {
5038
5040 MachineFunction *MF = BB->getParent();
5042
5043 switch (MI.getOpcode()) {
5044 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5045 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5046 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5047 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5048 case AMDGPU::S_UADDO_PSEUDO:
5049 case AMDGPU::S_USUBO_PSEUDO: {
5050 const DebugLoc &DL = MI.getDebugLoc();
5051 MachineOperand &Dest0 = MI.getOperand(0);
5052 MachineOperand &Dest1 = MI.getOperand(1);
5053 MachineOperand &Src0 = MI.getOperand(2);
5054 MachineOperand &Src1 = MI.getOperand(3);
5055
5056 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5057 ? AMDGPU::S_ADD_I32
5058 : AMDGPU::S_SUB_I32;
5059 // clang-format off
5060 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5061 .add(Src0)
5062 .add(Src1);
5063 // clang-format on
5064
5065 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5066 .addImm(1)
5067 .addImm(0);
5068
5069 MI.eraseFromParent();
5070 return BB;
5071 }
5072 case AMDGPU::S_ADD_U64_PSEUDO:
5073 case AMDGPU::S_SUB_U64_PSEUDO: {
5074 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5075 // For GFX12, we emit s_add_u64 and s_sub_u64.
5076 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5078 const DebugLoc &DL = MI.getDebugLoc();
5079 MachineOperand &Dest = MI.getOperand(0);
5080 MachineOperand &Src0 = MI.getOperand(1);
5081 MachineOperand &Src1 = MI.getOperand(2);
5082 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5083 if (Subtarget->hasScalarAddSub64()) {
5084 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5085 // clang-format off
5086 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5087 .add(Src0)
5088 .add(Src1);
5089 // clang-format on
5090 } else {
5091 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5092 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5093
5094 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5095 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5096
5097 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5098 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5099 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5100 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5101
5102 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5103 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5104 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5105 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5106
5107 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5108 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5109 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5110 .add(Src0Sub0)
5111 .add(Src1Sub0);
5112 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5113 .add(Src0Sub1)
5114 .add(Src1Sub1);
5115 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5116 .addReg(DestSub0)
5117 .addImm(AMDGPU::sub0)
5118 .addReg(DestSub1)
5119 .addImm(AMDGPU::sub1);
5120 }
5121 MI.eraseFromParent();
5122 return BB;
5123 }
5124 case AMDGPU::V_ADD_U64_PSEUDO:
5125 case AMDGPU::V_SUB_U64_PSEUDO: {
5127 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5129 const DebugLoc &DL = MI.getDebugLoc();
5130
5131 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5132
5133 MachineOperand &Dest = MI.getOperand(0);
5134 MachineOperand &Src0 = MI.getOperand(1);
5135 MachineOperand &Src1 = MI.getOperand(2);
5136
5137 if (IsAdd && ST.hasLshlAddB64()) {
5138 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5139 Dest.getReg())
5140 .add(Src0)
5141 .addImm(0)
5142 .add(Src1);
5143 TII->legalizeOperands(*Add);
5144 MI.eraseFromParent();
5145 return BB;
5146 }
5147
5148 const auto *CarryRC = TRI->getWaveMaskRegClass();
5149
5150 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5151 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5152
5153 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5154 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5155
5156 const TargetRegisterClass *Src0RC = Src0.isReg()
5157 ? MRI.getRegClass(Src0.getReg())
5158 : &AMDGPU::VReg_64RegClass;
5159 const TargetRegisterClass *Src1RC = Src1.isReg()
5160 ? MRI.getRegClass(Src1.getReg())
5161 : &AMDGPU::VReg_64RegClass;
5162
5163 const TargetRegisterClass *Src0SubRC =
5164 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5165 const TargetRegisterClass *Src1SubRC =
5166 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5167
5168 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5169 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5170 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5171 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5172
5173 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5174 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5175 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5176 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5177
5178 unsigned LoOpc =
5179 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5180 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5181 .addReg(CarryReg, RegState::Define)
5182 .add(SrcReg0Sub0)
5183 .add(SrcReg1Sub0)
5184 .addImm(0); // clamp bit
5185
5186 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5187 MachineInstr *HiHalf =
5188 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5189 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5190 .add(SrcReg0Sub1)
5191 .add(SrcReg1Sub1)
5192 .addReg(CarryReg, RegState::Kill)
5193 .addImm(0); // clamp bit
5194
5195 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5196 .addReg(DestSub0)
5197 .addImm(AMDGPU::sub0)
5198 .addReg(DestSub1)
5199 .addImm(AMDGPU::sub1);
5200 TII->legalizeOperands(*LoHalf);
5201 TII->legalizeOperands(*HiHalf);
5202 MI.eraseFromParent();
5203 return BB;
5204 }
5205 case AMDGPU::S_ADD_CO_PSEUDO:
5206 case AMDGPU::S_SUB_CO_PSEUDO: {
5207 // This pseudo has a chance to be selected
5208 // only from a uniform add/subcarry node. All the VGPR operands are
5209 // therefore assumed to be splat vectors.
5211 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5212 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5214 const DebugLoc &DL = MI.getDebugLoc();
5215 MachineOperand &Dest = MI.getOperand(0);
5216 MachineOperand &CarryDest = MI.getOperand(1);
5217 MachineOperand &Src0 = MI.getOperand(2);
5218 MachineOperand &Src1 = MI.getOperand(3);
5219 MachineOperand &Src2 = MI.getOperand(4);
5220 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5221 ? AMDGPU::S_ADDC_U32
5222 : AMDGPU::S_SUBB_U32;
5223 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5224 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5225 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5226 .addReg(Src0.getReg());
5227 Src0.setReg(RegOp0);
5228 }
5229 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5230 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5231 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5232 .addReg(Src1.getReg());
5233 Src1.setReg(RegOp1);
5234 }
5235 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5236 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5237 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5238 .addReg(Src2.getReg());
5239 Src2.setReg(RegOp2);
5240 }
5241
5242 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5243 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5244 assert(WaveSize == 64 || WaveSize == 32);
5245
5246 if (WaveSize == 64) {
5247 if (ST.hasScalarCompareEq64()) {
5248 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5249 .addReg(Src2.getReg())
5250 .addImm(0);
5251 } else {
5252 const TargetRegisterClass *SubRC =
5253 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5254 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5255 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5256 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5257 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5258 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259
5260 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5261 .add(Src2Sub0)
5262 .add(Src2Sub1);
5263
5264 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5265 .addReg(Src2_32, RegState::Kill)
5266 .addImm(0);
5267 }
5268 } else {
5269 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5270 .addReg(Src2.getReg())
5271 .addImm(0);
5272 }
5273
5274 // clang-format off
5275 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5276 .add(Src0)
5277 .add(Src1);
5278 // clang-format on
5279
5280 unsigned SelOpc =
5281 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5282
5283 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5284 .addImm(-1)
5285 .addImm(0);
5286
5287 MI.eraseFromParent();
5288 return BB;
5289 }
5290 case AMDGPU::SI_INIT_M0: {
5291 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5292 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5293 .add(MI.getOperand(0));
5294 MI.eraseFromParent();
5295 return BB;
5296 }
5297 case AMDGPU::GET_GROUPSTATICSIZE: {
5298 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5299 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5300 DebugLoc DL = MI.getDebugLoc();
5301 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5302 .add(MI.getOperand(0))
5303 .addImm(MFI->getLDSSize());
5304 MI.eraseFromParent();
5305 return BB;
5306 }
5307 case AMDGPU::GET_SHADERCYCLESHILO: {
5310 const DebugLoc &DL = MI.getDebugLoc();
5311 // The algorithm is:
5312 //
5313 // hi1 = getreg(SHADER_CYCLES_HI)
5314 // lo1 = getreg(SHADER_CYCLES_LO)
5315 // hi2 = getreg(SHADER_CYCLES_HI)
5316 //
5317 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5318 // Otherwise there was overflow and the result is hi2:0. In both cases the
5319 // result should represent the actual time at some point during the sequence
5320 // of three getregs.
5321 using namespace AMDGPU::Hwreg;
5322 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5324 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5325 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5326 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5327 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5328 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5330 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5331 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5332 .addReg(RegHi1)
5333 .addReg(RegHi2);
5334 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5335 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5336 .addReg(RegLo1)
5337 .addImm(0);
5338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5339 .add(MI.getOperand(0))
5340 .addReg(RegLo)
5341 .addImm(AMDGPU::sub0)
5342 .addReg(RegHi2)
5343 .addImm(AMDGPU::sub1);
5344 MI.eraseFromParent();
5345 return BB;
5346 }
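// A minimal C-like model of the hi/lo/hi read sequence above (illustrative
// only; read_hi/read_lo stand in for the S_GETREG_B32 reads and are not real
// helpers in this file):
//
//   uint32_t hi1 = read_hi(), lo1 = read_lo(), hi2 = read_hi();
//   uint64_t cycles = ((uint64_t)hi2 << 32) | (hi1 == hi2 ? lo1 : 0);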
5347 case AMDGPU::SI_INDIRECT_SRC_V1:
5348 case AMDGPU::SI_INDIRECT_SRC_V2:
5349 case AMDGPU::SI_INDIRECT_SRC_V4:
5350 case AMDGPU::SI_INDIRECT_SRC_V8:
5351 case AMDGPU::SI_INDIRECT_SRC_V9:
5352 case AMDGPU::SI_INDIRECT_SRC_V10:
5353 case AMDGPU::SI_INDIRECT_SRC_V11:
5354 case AMDGPU::SI_INDIRECT_SRC_V12:
5355 case AMDGPU::SI_INDIRECT_SRC_V16:
5356 case AMDGPU::SI_INDIRECT_SRC_V32:
5357 return emitIndirectSrc(MI, *BB, *getSubtarget());
5358 case AMDGPU::SI_INDIRECT_DST_V1:
5359 case AMDGPU::SI_INDIRECT_DST_V2:
5360 case AMDGPU::SI_INDIRECT_DST_V4:
5361 case AMDGPU::SI_INDIRECT_DST_V8:
5362 case AMDGPU::SI_INDIRECT_DST_V9:
5363 case AMDGPU::SI_INDIRECT_DST_V10:
5364 case AMDGPU::SI_INDIRECT_DST_V11:
5365 case AMDGPU::SI_INDIRECT_DST_V12:
5366 case AMDGPU::SI_INDIRECT_DST_V16:
5367 case AMDGPU::SI_INDIRECT_DST_V32:
5368 return emitIndirectDst(MI, *BB, *getSubtarget());
5369 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5370 case AMDGPU::SI_KILL_I1_PSEUDO:
5371 return splitKillBlock(MI, BB);
5372 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5374 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5375 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5376
5377 Register Dst = MI.getOperand(0).getReg();
5378 const MachineOperand &Src0 = MI.getOperand(1);
5379 const MachineOperand &Src1 = MI.getOperand(2);
5380 const DebugLoc &DL = MI.getDebugLoc();
5381 Register SrcCond = MI.getOperand(3).getReg();
5382
5383 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5384 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5385 const auto *CondRC = TRI->getWaveMaskRegClass();
5386 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5387
5388 const TargetRegisterClass *Src0RC = Src0.isReg()
5389 ? MRI.getRegClass(Src0.getReg())
5390 : &AMDGPU::VReg_64RegClass;
5391 const TargetRegisterClass *Src1RC = Src1.isReg()
5392 ? MRI.getRegClass(Src1.getReg())
5393 : &AMDGPU::VReg_64RegClass;
5394
5395 const TargetRegisterClass *Src0SubRC =
5396 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5397 const TargetRegisterClass *Src1SubRC =
5398 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5399
5400 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5401 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5402 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5403 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5404
5405 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5406 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5407 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5408 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5409
5410 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5411 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5412 .addImm(0)
5413 .add(Src0Sub0)
5414 .addImm(0)
5415 .add(Src1Sub0)
5416 .addReg(SrcCondCopy);
5417 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5418 .addImm(0)
5419 .add(Src0Sub1)
5420 .addImm(0)
5421 .add(Src1Sub1)
5422 .addReg(SrcCondCopy);
5423
5424 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5425 .addReg(DstLo)
5426 .addImm(AMDGPU::sub0)
5427 .addReg(DstHi)
5428 .addImm(AMDGPU::sub1);
5429 MI.eraseFromParent();
5430 return BB;
5431 }
5432 case AMDGPU::SI_BR_UNDEF: {
5434 const DebugLoc &DL = MI.getDebugLoc();
5435 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5436 .add(MI.getOperand(0));
5437 Br->getOperand(1).setIsUndef(); // read undef SCC
5438 MI.eraseFromParent();
5439 return BB;
5440 }
5441 case AMDGPU::ADJCALLSTACKUP:
5442 case AMDGPU::ADJCALLSTACKDOWN: {
5444 MachineInstrBuilder MIB(*MF, &MI);
5445 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5446 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5447 return BB;
5448 }
5449 case AMDGPU::SI_CALL_ISEL: {
5451 const DebugLoc &DL = MI.getDebugLoc();
5452
5453 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5454
5456 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5457
5458 for (const MachineOperand &MO : MI.operands())
5459 MIB.add(MO);
5460
5461 MIB.cloneMemRefs(MI);
5462 MI.eraseFromParent();
5463 return BB;
5464 }
5465 case AMDGPU::V_ADD_CO_U32_e32:
5466 case AMDGPU::V_SUB_CO_U32_e32:
5467 case AMDGPU::V_SUBREV_CO_U32_e32: {
5468 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5469 const DebugLoc &DL = MI.getDebugLoc();
5470 unsigned Opc = MI.getOpcode();
5471
5472 bool NeedClampOperand = false;
5473 if (TII->pseudoToMCOpcode(Opc) == -1) {
5474 Opc = AMDGPU::getVOPe64(Opc);
5475 NeedClampOperand = true;
5476 }
5477
5478 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5479 if (TII->isVOP3(*I)) {
5480 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5481 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5482 I.addReg(TRI->getVCC(), RegState::Define);
5483 }
5484 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5485 if (NeedClampOperand)
5486 I.addImm(0); // clamp bit for e64 encoding
5487
5488 TII->legalizeOperands(*I);
5489
5490 MI.eraseFromParent();
5491 return BB;
5492 }
5493 case AMDGPU::V_ADDC_U32_e32:
5494 case AMDGPU::V_SUBB_U32_e32:
5495 case AMDGPU::V_SUBBREV_U32_e32:
5496 // These instructions have an implicit use of vcc which counts towards the
5497 // constant bus limit.
5498 TII->legalizeOperands(MI);
5499 return BB;
5500 case AMDGPU::DS_GWS_INIT:
5501 case AMDGPU::DS_GWS_SEMA_BR:
5502 case AMDGPU::DS_GWS_BARRIER:
5503 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5504 [[fallthrough]];
5505 case AMDGPU::DS_GWS_SEMA_V:
5506 case AMDGPU::DS_GWS_SEMA_P:
5507 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5508 // An s_waitcnt 0 is required to be the instruction immediately following.
5509 if (getSubtarget()->hasGWSAutoReplay()) {
5511 return BB;
5512 }
5513
5514 return emitGWSMemViolTestLoop(MI, BB);
5515 case AMDGPU::S_SETREG_B32: {
5516 // Try to optimize cases that only set the denormal mode or rounding mode.
5517 //
5518 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5519 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5520 // instead.
5521 //
5522 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5523 // allow you to have a no-side-effect instruction in the output of a
5524 // side-effecting pattern.
5525 auto [ID, Offset, Width] =
5526 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5527 if (ID != AMDGPU::Hwreg::ID_MODE)
5528 return BB;
5529
5530 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5531 const unsigned SetMask = WidthMask << Offset;
5532
5533 if (getSubtarget()->hasDenormModeInst()) {
5534 unsigned SetDenormOp = 0;
5535 unsigned SetRoundOp = 0;
5536
5537 // The dedicated instructions can only set the whole denorm or round mode
5538 // at once, not a subset of bits in either.
5539 if (SetMask ==
5540 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5541 // If this fully sets both the round and denorm mode, emit the two
5542 // dedicated instructions for these.
5543 SetRoundOp = AMDGPU::S_ROUND_MODE;
5544 SetDenormOp = AMDGPU::S_DENORM_MODE;
5545 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5546 SetRoundOp = AMDGPU::S_ROUND_MODE;
5547 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5548 SetDenormOp = AMDGPU::S_DENORM_MODE;
5549 }
5550
5551 if (SetRoundOp || SetDenormOp) {
5553 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5554 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5555 unsigned ImmVal = Def->getOperand(1).getImm();
5556 if (SetRoundOp) {
5557 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5558 .addImm(ImmVal & 0xf);
5559
5560 // If we also have the denorm mode, get just the denorm mode bits.
5561 ImmVal >>= 4;
5562 }
5563
5564 if (SetDenormOp) {
5565 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5566 .addImm(ImmVal & 0xf);
5567 }
5568
5569 MI.eraseFromParent();
5570 return BB;
5571 }
5572 }
5573 }
5574
5575 // If only FP bits are touched, use the no-side-effects pseudo.
5576 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5577 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5578 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5579
5580 return BB;
5581 }
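// Worked example of the peephole above (a sketch, assuming the usual MODE
// register layout with the FP round mode in bits [3:0] and the FP denorm
// mode in bits [7:4]):
//
//   s_mov_b32 s0, 3
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0   ; touches only round bits
//     --> s_round_mode 0x3
//   s_mov_b32 s0, 15
//   s_setreg_b32 hwreg(HW_REG_MODE, 4, 4), s0   ; touches only denorm bits
//     --> s_denorm_mode 0xf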
5582 case AMDGPU::S_INVERSE_BALLOT_U32:
5583 case AMDGPU::S_INVERSE_BALLOT_U64:
5584 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5585 // necessary. After that they are equivalent to a COPY.
5586 MI.setDesc(TII->get(AMDGPU::COPY));
5587 return BB;
5588 case AMDGPU::ENDPGM_TRAP: {
5589 const DebugLoc &DL = MI.getDebugLoc();
5590 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5591 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5592 MI.addOperand(MachineOperand::CreateImm(0));
5593 return BB;
5594 }
5595
5596 // We need a block split to make the real endpgm a terminator. We also don't
5597 // want to break phis in successor blocks, so we can't just delete to the
5598 // end of the block.
5599
5600 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5602 MF->push_back(TrapBB);
5603 // clang-format off
5604 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5605 .addImm(0);
5606 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5607 .addMBB(TrapBB);
5608 // clang-format on
5609
5610 BB->addSuccessor(TrapBB);
5611 MI.eraseFromParent();
5612 return SplitBB;
5613 }
5614 case AMDGPU::SIMULATED_TRAP: {
5615 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5617 MachineBasicBlock *SplitBB =
5618 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5619 MI.eraseFromParent();
5620 return SplitBB;
5621 }
5622 default:
5623 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5624 if (!MI.mayStore())
5626 return BB;
5627 }
5629 }
5630}
5631
5633 // This currently forces unfolding various combinations of fsub into fma with
5634 // free fneg'd operands. As long as we have fast FMA (controlled by
5635 // isFMAFasterThanFMulAndFAdd), we should perform these.
5636
5637 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5638 // most of these combines appear to be cycle neutral but save on instruction
5639 // count / code size.
5640 return true;
5641}
5642
5644
5646 EVT VT) const {
5647 if (!VT.isVector()) {
5648 return MVT::i1;
5649 }
5650 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5651}
5652
5654 // TODO: Should i16 be used always if legal? For now it would force VALU
5655 // shifts.
5656 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5657}
5658
5660 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5661 ? Ty.changeElementSize(16)
5662 : Ty.changeElementSize(32);
5663}
5664
5665 // Answering this is somewhat tricky and depends on the specific device, as
5666 // different devices have different rates for fma and for f64 operations.
5667//
5668// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5669// regardless of which device (although the number of cycles differs between
5670// devices), so it is always profitable for f64.
5671//
5672// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5673// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5674// which we can always do even without fused FP ops since it returns the same
5675// result as the separate operations and since it is always full
5676// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5677// however does not support denormals, so we do report fma as faster if we have
5678// a fast fma device and require denormals.
5679//
5681 EVT VT) const {
5682 VT = VT.getScalarType();
5683
5684 switch (VT.getSimpleVT().SimpleTy) {
5685 case MVT::f32: {
5686 // If mad is not available this depends only on if f32 fma is full rate.
5687 if (!Subtarget->hasMadMacF32Insts())
5688 return Subtarget->hasFastFMAF32();
5689
5690 // Otherwise f32 mad is always full rate and returns the same result as
5691 // the separate operations, so it should be preferred over fma.
5692 // However, it does not support denormals.
5693 if (!denormalModeIsFlushAllF32(MF))
5694 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5695
5696 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5697 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5698 }
5699 case MVT::f64:
5700 return true;
5701 case MVT::f16:
5702 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5703 default:
5704 break;
5705 }
5706
5707 return false;
5708}
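// Worked cases for the policy above (illustrative; the real answer is
// whatever the subtarget queries return):
//
//   f64                       -> always true (fma and mul run at the same rate)
//   f32, no v_mad_f32 insts   -> true only when f32 fma is full rate
//   f16                       -> true when 16-bit insts exist and f16
//                                denormals are not flushed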
5709
5711 LLT Ty) const {
5712 switch (Ty.getScalarSizeInBits()) {
5713 case 16:
5714 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5715 case 32:
5716 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5717 case 64:
5718 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5719 default:
5720 break;
5721 }
5722
5723 return false;
5724}
5725
5727 if (!Ty.isScalar())
5728 return false;
5729
5730 if (Ty.getScalarSizeInBits() == 16)
5731 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5732 if (Ty.getScalarSizeInBits() == 32)
5733 return Subtarget->hasMadMacF32Insts() &&
5734 denormalModeIsFlushAllF32(*MI.getMF());
5735
5736 return false;
5737}
5738
5740 const SDNode *N) const {
5741 // TODO: Check future ftz flag
5742 // v_mad_f32/v_mac_f32 do not support denormals.
5743 EVT VT = N->getValueType(0);
5744 if (VT == MVT::f32)
5745 return Subtarget->hasMadMacF32Insts() &&
5747 if (VT == MVT::f16) {
5748 return Subtarget->hasMadF16() &&
5750 }
5751
5752 return false;
5753}
5754
5755//===----------------------------------------------------------------------===//
5756// Custom DAG Lowering Operations
5757//===----------------------------------------------------------------------===//
5758
5759// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5760// wider vector type is legal.
5762 SelectionDAG &DAG) const {
5763 unsigned Opc = Op.getOpcode();
5764 EVT VT = Op.getValueType();
5765 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5766 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5767 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5768 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5769
5770 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5771
5772 SDLoc SL(Op);
5773 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5774 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5775
5776 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5777}
5778
5779// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5780// wider vector type is legal.
5782 SelectionDAG &DAG) const {
5783 unsigned Opc = Op.getOpcode();
5784 EVT VT = Op.getValueType();
5785 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5786 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5787 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5788 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5789
5790 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5791 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5792
5793 SDLoc SL(Op);
5794
5795 SDValue OpLo =
5796 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5797 SDValue OpHi =
5798 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5799
5800 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5801}
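// Illustrative example of the splitting above (a sketch, not part of the
// lowering itself): a v4f16 fadd becomes two v2f16 fadds whose results are
// concatenated, instead of being scalarized into four f16 ops:
//
//   (fadd v4f16:%a, v4f16:%b)
//     --> (concat_vectors (fadd v2f16 %a.lo, %b.lo), (fadd v2f16 %a.hi, %b.hi))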
5802
5804 SelectionDAG &DAG) const {
5805 unsigned Opc = Op.getOpcode();
5806 EVT VT = Op.getValueType();
5807 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5808 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5809 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5810 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5811 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5812 VT == MVT::v32bf16);
5813
5814 SDValue Op0 = Op.getOperand(0);
5815 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5816 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5817 : std::pair(Op0, Op0);
5818
5819 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5820 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5821
5822 SDLoc SL(Op);
5823 auto ResVT = DAG.GetSplitDestVTs(VT);
5824
5825 SDValue OpLo =
5826 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5827 SDValue OpHi =
5828 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5829
5830 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5831}
5832
5834 switch (Op.getOpcode()) {
5835 default:
5837 case ISD::BRCOND:
5838 return LowerBRCOND(Op, DAG);
5839 case ISD::RETURNADDR:
5840 return LowerRETURNADDR(Op, DAG);
5841 case ISD::LOAD: {
5842 SDValue Result = LowerLOAD(Op, DAG);
5843 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5844 "Load should return a value and a chain");
5845 return Result;
5846 }
5847 case ISD::FSQRT: {
5848 EVT VT = Op.getValueType();
5849 if (VT == MVT::f32)
5850 return lowerFSQRTF32(Op, DAG);
5851 if (VT == MVT::f64)
5852 return lowerFSQRTF64(Op, DAG);
5853 return SDValue();
5854 }
5855 case ISD::FSIN:
5856 case ISD::FCOS:
5857 return LowerTrig(Op, DAG);
5858 case ISD::SELECT:
5859 return LowerSELECT(Op, DAG);
5860 case ISD::FDIV:
5861 return LowerFDIV(Op, DAG);
5862 case ISD::FFREXP:
5863 return LowerFFREXP(Op, DAG);
5865 return LowerATOMIC_CMP_SWAP(Op, DAG);
5866 case ISD::STORE:
5867 return LowerSTORE(Op, DAG);
5868 case ISD::GlobalAddress: {
5871 return LowerGlobalAddress(MFI, Op, DAG);
5872 }
5874 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5876 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5878 return LowerINTRINSIC_VOID(Op, DAG);
5879 case ISD::ADDRSPACECAST:
5880 return lowerADDRSPACECAST(Op, DAG);
5882 return lowerINSERT_SUBVECTOR(Op, DAG);
5884 return lowerINSERT_VECTOR_ELT(Op, DAG);
5886 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5888 return lowerVECTOR_SHUFFLE(Op, DAG);
5890 return lowerSCALAR_TO_VECTOR(Op, DAG);
5891 case ISD::BUILD_VECTOR:
5892 return lowerBUILD_VECTOR(Op, DAG);
5893 case ISD::FP_ROUND:
5895 return lowerFP_ROUND(Op, DAG);
5896 case ISD::TRAP:
5897 return lowerTRAP(Op, DAG);
5898 case ISD::DEBUGTRAP:
5899 return lowerDEBUGTRAP(Op, DAG);
5900 case ISD::ABS:
5901 case ISD::FABS:
5902 case ISD::FNEG:
5903 case ISD::FCANONICALIZE:
5904 case ISD::BSWAP:
5905 return splitUnaryVectorOp(Op, DAG);
5906 case ISD::FMINNUM:
5907 case ISD::FMAXNUM:
5908 return lowerFMINNUM_FMAXNUM(Op, DAG);
5909 case ISD::FLDEXP:
5910 case ISD::STRICT_FLDEXP:
5911 return lowerFLDEXP(Op, DAG);
5912 case ISD::FMA:
5913 return splitTernaryVectorOp(Op, DAG);
5914 case ISD::FP_TO_SINT:
5915 case ISD::FP_TO_UINT:
5916 return LowerFP_TO_INT(Op, DAG);
5917 case ISD::SHL:
5918 case ISD::SRA:
5919 case ISD::SRL:
5920 case ISD::ADD:
5921 case ISD::SUB:
5922 case ISD::SMIN:
5923 case ISD::SMAX:
5924 case ISD::UMIN:
5925 case ISD::UMAX:
5926 case ISD::FADD:
5927 case ISD::FMUL:
5928 case ISD::FMINNUM_IEEE:
5929 case ISD::FMAXNUM_IEEE:
5930 case ISD::FMINIMUM:
5931 case ISD::FMAXIMUM:
5932 case ISD::FMINIMUMNUM:
5933 case ISD::FMAXIMUMNUM:
5934 case ISD::UADDSAT:
5935 case ISD::USUBSAT:
5936 case ISD::SADDSAT:
5937 case ISD::SSUBSAT:
5938 return splitBinaryVectorOp(Op, DAG);
5939 case ISD::MUL:
5940 return lowerMUL(Op, DAG);
5941 case ISD::SMULO:
5942 case ISD::UMULO:
5943 return lowerXMULO(Op, DAG);
5944 case ISD::SMUL_LOHI:
5945 case ISD::UMUL_LOHI:
5946 return lowerXMUL_LOHI(Op, DAG);
5948 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5949 case ISD::STACKSAVE:
5950 return LowerSTACKSAVE(Op, DAG);
5951 case ISD::GET_ROUNDING:
5952 return lowerGET_ROUNDING(Op, DAG);
5953 case ISD::SET_ROUNDING:
5954 return lowerSET_ROUNDING(Op, DAG);
5955 case ISD::PREFETCH:
5956 return lowerPREFETCH(Op, DAG);
5957 case ISD::FP_EXTEND:
5959 return lowerFP_EXTEND(Op, DAG);
5960 case ISD::GET_FPENV:
5961 return lowerGET_FPENV(Op, DAG);
5962 case ISD::SET_FPENV:
5963 return lowerSET_FPENV(Op, DAG);
5964 }
5965 return SDValue();
5966}
5967
5968// Used for D16: Casts the result of an instruction into the right vector,
5969// packs values if loads return unpacked values.
5971 const SDLoc &DL, SelectionDAG &DAG,
5972 bool Unpacked) {
5973 if (!LoadVT.isVector())
5974 return Result;
5975
5976 // Cast back to the original packed type or to a larger type that is a
5977 // multiple of 32 bits for D16. Widening the return type is required for
5978 // legalization.
5979 EVT FittingLoadVT = LoadVT;
5980 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5981 FittingLoadVT =
5983 LoadVT.getVectorNumElements() + 1);
5984 }
5985
5986 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5987 // Truncate to v2i16/v4i16.
5988 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5989
5990 // Work around the legalizer not scalarizing the truncate after vector op
5991 // legalization and not creating an intermediate vector trunc.
5993 DAG.ExtractVectorElements(Result, Elts);
5994 for (SDValue &Elt : Elts)
5995 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5996
5997 // Pad illegal v1i16/v3f16 to v4i16
5998 if ((LoadVT.getVectorNumElements() % 2) == 1)
5999 Elts.push_back(DAG.getUNDEF(MVT::i16));
6000
6001 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6002
6003 // Bitcast to original type (v2f16/v4f16).
6004 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6005 }
6006
6007 // Cast back to the original packed type.
6008 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6009}
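// Worked example for the unpacked D16 path above (illustrative): on an
// unpacked-D16 subtarget a d16 load of v4f16 really returns v4i32 with one
// half-float in the low 16 bits of each element. The code above truncates
// each element to i16, rebuilds a v4i16, and bitcasts back to v4f16:
//
//   v4i32 {0x00003C00, 0x0000C000, ...}
//     --> trunc each --> v4i16 {0x3C00, 0xC000, ...} --> bitcast --> v4f16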
6010
6011SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6012 SelectionDAG &DAG,
6014 bool IsIntrinsic) const {
6015 SDLoc DL(M);
6016
6017 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6018 EVT LoadVT = M->getValueType(0);
6019
6020 EVT EquivLoadVT = LoadVT;
6021 if (LoadVT.isVector()) {
6022 if (Unpacked) {
6023 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6024 LoadVT.getVectorNumElements());
6025 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6026 // Widen v3f16 to legal type
6027 EquivLoadVT =
6029 LoadVT.getVectorNumElements() + 1);
6030 }
6031 }
6032
6033 // Change from v4f16/v2f16 to EquivLoadVT.
6034 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6035
6037 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6038 M->getMemoryVT(), M->getMemOperand());
6039
6040 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6041
6042 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6043}
6044
6045SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6046 SelectionDAG &DAG,
6047 ArrayRef<SDValue> Ops) const {
6048 SDLoc DL(M);
6049 EVT LoadVT = M->getValueType(0);
6050 EVT EltType = LoadVT.getScalarType();
6051 EVT IntVT = LoadVT.changeTypeToInteger();
6052
6053 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6054
6055 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6056 bool IsTFE = M->getNumValues() == 3;
6057
6058 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6059 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6060 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6061 : AMDGPUISD::BUFFER_LOAD;
6062
6063 if (IsD16) {
6064 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6065 }
6066
6067 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6068 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6069 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6070 IsTFE);
6071
6072 if (isTypeLegal(LoadVT)) {
6073 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6074 M->getMemOperand(), DAG);
6075 }
6076
6077 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6078 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6079 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6080 M->getMemOperand(), DAG);
6081 return DAG.getMergeValues(
6082 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6083 DL);
6084}
6085
6087 SelectionDAG &DAG) {
6088 EVT VT = N->getValueType(0);
6089 unsigned CondCode = N->getConstantOperandVal(3);
6090 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6091 return DAG.getUNDEF(VT);
6092
6093 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6094
6095 SDValue LHS = N->getOperand(1);
6096 SDValue RHS = N->getOperand(2);
6097
6098 SDLoc DL(N);
6099
6100 EVT CmpVT = LHS.getValueType();
6101 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6102 unsigned PromoteOp =
6103 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6104 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6105 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6106 }
6107
6108 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6109
6110 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6111 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6112
6113 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6114 DAG.getCondCode(CCOpcode));
6115 if (VT.bitsEq(CCVT))
6116 return SetCC;
6117 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6118}
6119
6121 SelectionDAG &DAG) {
6122 EVT VT = N->getValueType(0);
6123
6124 unsigned CondCode = N->getConstantOperandVal(3);
6125 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6126 return DAG.getUNDEF(VT);
6127
6128 SDValue Src0 = N->getOperand(1);
6129 SDValue Src1 = N->getOperand(2);
6130 EVT CmpVT = Src0.getValueType();
6131 SDLoc SL(N);
6132
6133 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6134 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6135 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6136 }
6137
6138 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6139 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6140 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6141 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6142 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6143 DAG.getCondCode(CCOpcode));
6144 if (VT.bitsEq(CCVT))
6145 return SetCC;
6146 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6147}
6148
6150 SelectionDAG &DAG) {
6151 EVT VT = N->getValueType(0);
6152 SDValue Src = N->getOperand(1);
6153 SDLoc SL(N);
6154
6155 if (Src.getOpcode() == ISD::SETCC) {
6156 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6157 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6158 Src.getOperand(1), Src.getOperand(2));
6159 }
6160 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6161 // (ballot 0) -> 0
6162 if (Arg->isZero())
6163 return DAG.getConstant(0, SL, VT);
6164
6165 // (ballot 1) -> EXEC/EXEC_LO
6166 if (Arg->isOne()) {
6167 Register Exec;
6168 if (VT.getScalarSizeInBits() == 32)
6169 Exec = AMDGPU::EXEC_LO;
6170 else if (VT.getScalarSizeInBits() == 64)
6171 Exec = AMDGPU::EXEC;
6172 else
6173 return SDValue();
6174
6175 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6176 }
6177 }
6178
6179 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6180 // ISD::SETNE)
6181 return DAG.getNode(
6182 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6183 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6184}
6185
6187 SelectionDAG &DAG) {
6188 EVT VT = N->getValueType(0);
6189 unsigned ValSize = VT.getSizeInBits();
6190 unsigned IID = N->getConstantOperandVal(0);
6191 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6192 IID == Intrinsic::amdgcn_permlanex16;
6193 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6194 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6195 SDLoc SL(N);
6196 MVT IntVT = MVT::getIntegerVT(ValSize);
6197 const GCNSubtarget *ST = TLI.getSubtarget();
6198 unsigned SplitSize = 32;
6199 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6200 ST->hasDPALU_DPP() &&
6201 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6202 SplitSize = 64;
6203
6204 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6205 SDValue Src2, MVT ValT) -> SDValue {
6207 switch (IID) {
6208 case Intrinsic::amdgcn_permlane16:
6209 case Intrinsic::amdgcn_permlanex16:
6210 case Intrinsic::amdgcn_update_dpp:
6211 Operands.push_back(N->getOperand(6));
6212 Operands.push_back(N->getOperand(5));
6213 Operands.push_back(N->getOperand(4));
6214 [[fallthrough]];
6215 case Intrinsic::amdgcn_writelane:
6216 Operands.push_back(Src2);
6217 [[fallthrough]];
6218 case Intrinsic::amdgcn_readlane:
6219 case Intrinsic::amdgcn_set_inactive:
6220 case Intrinsic::amdgcn_set_inactive_chain_arg:
6221 case Intrinsic::amdgcn_mov_dpp8:
6222 Operands.push_back(Src1);
6223 [[fallthrough]];
6224 case Intrinsic::amdgcn_readfirstlane:
6225 case Intrinsic::amdgcn_permlane64:
6226 Operands.push_back(Src0);
6227 break;
6228 default:
6229 llvm_unreachable("unhandled lane op");
6230 }
6231
6232 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6233 std::reverse(Operands.begin(), Operands.end());
6234
6235 if (SDNode *GL = N->getGluedNode()) {
6236 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6237 GL = GL->getOperand(0).getNode();
6238 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6239 SDValue(GL, 0)));
6240 }
6241
6242 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6243 };
6244
6245 SDValue Src0 = N->getOperand(1);
6246 SDValue Src1, Src2;
6247 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6248 IID == Intrinsic::amdgcn_mov_dpp8 ||
6249 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6250 Src1 = N->getOperand(2);
6251 if (IID == Intrinsic::amdgcn_writelane ||
6252 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6253 Src2 = N->getOperand(3);
6254 }
6255
6256 if (ValSize == SplitSize) {
6257 // Already legal
6258 return SDValue();
6259 }
6260
6261 if (ValSize < 32) {
6262 bool IsFloat = VT.isFloatingPoint();
6263 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6264 SL, MVT::i32);
6265
6266 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6267 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6268 SL, MVT::i32);
6269 }
6270
6271 if (IID == Intrinsic::amdgcn_writelane) {
6272 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6273 SL, MVT::i32);
6274 }
6275
6276 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6277 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6278 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6279 }
6280
6281 if (ValSize % SplitSize != 0)
6282 return SDValue();
6283
6284 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6285 EVT VT = N->getValueType(0);
6286 unsigned NE = VT.getVectorNumElements();
6287 EVT EltVT = VT.getVectorElementType();
6289 unsigned NumOperands = N->getNumOperands();
6290 SmallVector<SDValue, 4> Operands(NumOperands);
6291 SDNode *GL = N->getGluedNode();
6292
6293 // only handle convergencectrl_glue
6295
6296 for (unsigned i = 0; i != NE; ++i) {
6297 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6298 ++j) {
6299 SDValue Operand = N->getOperand(j);
6300 EVT OperandVT = Operand.getValueType();
6301 if (OperandVT.isVector()) {
6302 // A vector operand; extract a single element.
6303 EVT OperandEltVT = OperandVT.getVectorElementType();
6304 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6305 Operand, DAG.getVectorIdxConstant(i, SL));
6306 } else {
6307 // A scalar operand; just use it as is.
6308 Operands[j] = Operand;
6309 }
6310 }
6311
6312 if (GL)
6313 Operands[NumOperands - 1] =
6314 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6315 SDValue(GL->getOperand(0).getNode(), 0));
6316
6317 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6318 }
6319
6320 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6321 return DAG.getBuildVector(VecVT, SL, Scalars);
6322 };
6323
6324 if (VT.isVector()) {
6325 switch (MVT::SimpleValueType EltTy =
6327 case MVT::i32:
6328 case MVT::f32:
6329 if (SplitSize == 32) {
6330 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6331 return unrollLaneOp(LaneOp.getNode());
6332 }
6333 [[fallthrough]];
6334 case MVT::i16:
6335 case MVT::f16:
6336 case MVT::bf16: {
6337 unsigned SubVecNumElt =
6338 SplitSize / VT.getVectorElementType().getSizeInBits();
6339 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6341 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6342 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6343 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6344 DAG.getConstant(EltIdx, SL, MVT::i32));
6345
6346 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6347 IsPermLane16)
6348 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6349 DAG.getConstant(EltIdx, SL, MVT::i32));
6350
6351 if (IID == Intrinsic::amdgcn_writelane)
6352 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6353 DAG.getConstant(EltIdx, SL, MVT::i32));
6354
6355 Pieces.push_back(
6356 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6357 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6358 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6359 EltIdx += SubVecNumElt;
6360 }
6361 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6362 }
6363 default:
6364 // Handle all other cases by bitcasting to i32 vectors
6365 break;
6366 }
6367 }
6368
6369 MVT VecVT =
6370 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6371 Src0 = DAG.getBitcast(VecVT, Src0);
6372
6373 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6374 Src1 = DAG.getBitcast(VecVT, Src1);
6375
6376 if (IID == Intrinsic::amdgcn_writelane)
6377 Src2 = DAG.getBitcast(VecVT, Src2);
6378
6379 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6380 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6381 return DAG.getBitcast(VT, UnrolledLaneOp);
6382}
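// Example of the widening/splitting above (a sketch): a 64-bit readlane is
// not legal as a single operation, so the value is bitcast to v2i32, a
// 32-bit lane op is emitted per element, and the pieces are reassembled:
//
//   (readlane i64:%v, %lane)
//     --> bitcast %v to v2i32; readlane each i32 element;
//         build_vector the results; bitcast back to i64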
6383
6386 SelectionDAG &DAG) const {
6387 switch (N->getOpcode()) {
6389 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6390 Results.push_back(Res);
6391 return;
6392 }
6394 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6395 Results.push_back(Res);
6396 return;
6397 }
6399 unsigned IID = N->getConstantOperandVal(0);
6400 switch (IID) {
6401 case Intrinsic::amdgcn_make_buffer_rsrc:
6402 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6403 return;
6404 case Intrinsic::amdgcn_cvt_pkrtz: {
6405 SDValue Src0 = N->getOperand(1);
6406 SDValue Src1 = N->getOperand(2);
6407 SDLoc SL(N);
6408 SDValue Cvt =
6409 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6410 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6411 return;
6412 }
6413 case Intrinsic::amdgcn_cvt_pknorm_i16:
6414 case Intrinsic::amdgcn_cvt_pknorm_u16:
6415 case Intrinsic::amdgcn_cvt_pk_i16:
6416 case Intrinsic::amdgcn_cvt_pk_u16: {
6417 SDValue Src0 = N->getOperand(1);
6418 SDValue Src1 = N->getOperand(2);
6419 SDLoc SL(N);
6420 unsigned Opcode;
6421
6422 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6424 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6426 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6428 else
6430
6431 EVT VT = N->getValueType(0);
6432 if (isTypeLegal(VT))
6433 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6434 else {
6435 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6436 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6437 }
6438 return;
6439 }
6440 case Intrinsic::amdgcn_s_buffer_load: {
6441 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6442 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
6443 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6444 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6445 // s_buffer_load_i8.
6446 if (!Subtarget->hasScalarSubwordLoads())
6447 return;
6448 SDValue Op = SDValue(N, 0);
6449 SDValue Rsrc = Op.getOperand(1);
6450 SDValue Offset = Op.getOperand(2);
6451 SDValue CachePolicy = Op.getOperand(3);
6452 EVT VT = Op.getValueType();
6453 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6454 SDLoc DL(Op);
6456 const DataLayout &DataLayout = DAG.getDataLayout();
6457 Align Alignment =
6463 VT.getStoreSize(), Alignment);
6464 SDValue LoadVal;
6465 if (!Offset->isDivergent()) {
6466 SDValue Ops[] = {Rsrc, // source register
6467 Offset, CachePolicy};
6468 SDValue BufferLoad =
6470 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6471 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6472 } else {
6473 SDValue Ops[] = {
6474 DAG.getEntryNode(), // Chain
6475 Rsrc, // rsrc
6476 DAG.getConstant(0, DL, MVT::i32), // vindex
6477 {}, // voffset
6478 {}, // soffset
6479 {}, // offset
6480 CachePolicy, // cachepolicy
6481 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6482 };
6483 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6484 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6485 }
6486 Results.push_back(LoadVal);
6487 return;
6488 }
6489 }
6490 break;
6491 }
6493 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6494 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6495 // FIXME: Hacky
6496 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6497 Results.push_back(Res.getOperand(I));
6498 }
6499 } else {
6500 Results.push_back(Res);
6501 Results.push_back(Res.getValue(1));
6502 }
6503 return;
6504 }
6505
6506 break;
6507 }
6508 case ISD::SELECT: {
6509 SDLoc SL(N);
6510 EVT VT = N->getValueType(0);
6511 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6512 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6513 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6514
6515 EVT SelectVT = NewVT;
6516 if (NewVT.bitsLT(MVT::i32)) {
6517 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6518 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6519 SelectVT = MVT::i32;
6520 }
6521
6522 SDValue NewSelect =
6523 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6524
6525 if (NewVT != SelectVT)
6526 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6527 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6528 return;
6529 }
6530 case ISD::FNEG: {
6531 if (N->getValueType(0) != MVT::v2f16)
6532 break;
6533
6534 SDLoc SL(N);
6535 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6536
6537 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6538 DAG.getConstant(0x80008000, SL, MVT::i32));
6539 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6540 return;
6541 }
6542 case ISD::FABS: {
6543 if (N->getValueType(0) != MVT::v2f16)
6544 break;
6545
6546 SDLoc SL(N);
6547 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6548
6549 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6550 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6551 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6552 return;
6553 }
6554 case ISD::FSQRT: {
6555 if (N->getValueType(0) != MVT::f16)
6556 break;
6557 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6558 break;
6559 }
6560 default:
6562 break;
6563 }
6564}
6565
6566/// Helper function for LowerBRCOND
6567static SDNode *findUser(SDValue Value, unsigned Opcode) {
6568
6569 for (SDUse &U : Value->uses()) {
6570 if (U.get() != Value)
6571 continue;
6572
6573 if (U.getUser()->getOpcode() == Opcode)
6574 return U.getUser();
6575 }
6576 return nullptr;
6577}
6578
6579unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6580 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6581 switch (Intr->getConstantOperandVal(1)) {
6582 case Intrinsic::amdgcn_if:
6583 return AMDGPUISD::IF;
6584 case Intrinsic::amdgcn_else:
6585 return AMDGPUISD::ELSE;
6586 case Intrinsic::amdgcn_loop:
6587 return AMDGPUISD::LOOP;
6588 case Intrinsic::amdgcn_end_cf:
6589 llvm_unreachable("should not occur");
6590 default:
6591 return 0;
6592 }
6593 }
6594
6595 // break, if_break, else_break are all only used as inputs to loop, not
6596 // directly as branch conditions.
6597 return 0;
6598}
6599
6601 const Triple &TT = getTargetMachine().getTargetTriple();
6605}
6606
6608 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6609 return false;
6610
6611 // FIXME: Either avoid relying on address space here or change the default
6612 // address space for functions to avoid the explicit check.
6613 return (GV->getValueType()->isFunctionTy() ||
6616}
6617
6619 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6620}
6621
6623 if (!GV->hasExternalLinkage())
6624 return true;
6625
6626 const auto OS = getTargetMachine().getTargetTriple().getOS();
6627 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6628}
6629
6630/// This transforms the control flow intrinsics to get the branch destination as
6631 /// the last parameter; it also switches the branch target with BR if the need arises.
6632SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6633 SDLoc DL(BRCOND);
6634
6635 SDNode *Intr = BRCOND.getOperand(1).getNode();
6636 SDValue Target = BRCOND.getOperand(2);
6637 SDNode *BR = nullptr;
6638 SDNode *SetCC = nullptr;
6639
6640 if (Intr->getOpcode() == ISD::SETCC) {
6641 // As long as we negate the condition everything is fine
6642 SetCC = Intr;
6643 Intr = SetCC->getOperand(0).getNode();
6644
6645 } else {
6646 // Get the target from BR if we don't negate the condition
6647 BR = findUser(BRCOND, ISD::BR);
6648 assert(BR && "brcond missing unconditional branch user");
6649 Target = BR->getOperand(1);
6650 }
6651
6652 unsigned CFNode = isCFIntrinsic(Intr);
6653 if (CFNode == 0) {
6654 // This is a uniform branch so we don't need to legalize.
6655 return BRCOND;
6656 }
6657
6658 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6659 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6660
6661 assert(!SetCC ||
6662 (SetCC->getConstantOperandVal(1) == 1 &&
6663 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6664 ISD::SETNE));
6665
6666 // operands of the new intrinsic call
6668 if (HaveChain)
6669 Ops.push_back(BRCOND.getOperand(0));
6670
6671 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6672 Ops.push_back(Target);
6673
6674 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6675
6676 // build the new intrinsic call
6677 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6678
6679 if (!HaveChain) {
6680 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6681
6682 Result = DAG.getMergeValues(Ops, DL).getNode();
6683 }
6684
6685 if (BR) {
6686 // Give the branch instruction our target
6687 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6688 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6689 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6690 }
6691
6692 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6693
6694 // Copy the intrinsic results to registers
6695 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6697 if (!CopyToReg)
6698 continue;
6699
6700 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6701 SDValue(Result, i - 1), SDValue());
6702
6703 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6704 }
6705
6706 // Remove the old intrinsic from the chain
6707 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6708 Intr->getOperand(0));
6709
6710 return Chain;
6711}
6712
6713SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6714 MVT VT = Op.getSimpleValueType();
6715 SDLoc DL(Op);
6716 // Checking the depth
6717 if (Op.getConstantOperandVal(0) != 0)
6718 return DAG.getConstant(0, DL, VT);
6719
6722 // Check for kernel and shader functions
6723 if (Info->isEntryFunction())
6724 return DAG.getConstant(0, DL, VT);
6725
6726 MachineFrameInfo &MFI = MF.getFrameInfo();
6727 // There is a call to @llvm.returnaddress in this function
6728 MFI.setReturnAddressIsTaken(true);
6729
6731 // Get the return address reg and mark it as an implicit live-in
6732 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6733 getRegClassFor(VT, Op.getNode()->isDivergent()));
6734
6735 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6736}
6737
6738SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6739 const SDLoc &DL, EVT VT) const {
6740 return Op.getValueType().bitsLE(VT)
6741 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6742 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6743 DAG.getTargetConstant(0, DL, MVT::i32));
6744}
6745
6746SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6747 assert(Op.getValueType() == MVT::f16 &&
6748 "Do not know how to custom lower FP_ROUND for non-f16 type");
6749
6750 SDValue Src = Op.getOperand(0);
6751 EVT SrcVT = Src.getValueType();
6752 if (SrcVT != MVT::f64)
6753 return Op;
6754
6755 // TODO: Handle strictfp
6756 if (Op.getOpcode() != ISD::FP_ROUND)
6757 return Op;
6758
6759 SDLoc DL(Op);
6760
6761 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6762 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6763 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6764}
6765
6766SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6767 SelectionDAG &DAG) const {
6768 EVT VT = Op.getValueType();
6769 const MachineFunction &MF = DAG.getMachineFunction();
6771 bool IsIEEEMode = Info->getMode().IEEE;
6772
6773 // FIXME: Assert during selection that this is only selected for
6774 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6775 // mode functions, but this happens to be OK since it's only done in cases
6776 // where it is known that there is no sNaN.
6777 if (IsIEEEMode)
6778 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6779
6780 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6781 VT == MVT::v16bf16)
6782 return splitBinaryVectorOp(Op, DAG);
6783 return Op;
6784}
6785
6786SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6787 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6788 EVT VT = Op.getValueType();
6789 assert(VT == MVT::f16);
6790
6791 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6792 EVT ExpVT = Exp.getValueType();
6793 if (ExpVT == MVT::i16)
6794 return Op;
6795
6796 SDLoc DL(Op);
6797
6798 // Correct the exponent type for f16 to i16.
6799 // Clamp the range of the exponent to the instruction's range.
6800
6801 // TODO: This should be a generic narrowing legalization, and can easily be
6802 // done for GlobalISel as well.
6803
6804 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6805 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6806
6807 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6808 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6809
6810 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6811
6812 if (IsStrict) {
6813 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6814 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6815 }
6816
6817 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6818}
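// Minimal sketch of the exponent clamp above (illustrative C++; legalizeLdexpExp
// is not a helper in this file):
//
//   int16_t legalizeLdexpExp(int32_t Exp) {
//     return (int16_t)std::clamp(Exp, (int32_t)INT16_MIN, (int32_t)INT16_MAX);
//   }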
6819
6821 switch (Op->getOpcode()) {
6822 case ISD::SRA:
6823 case ISD::SMIN:
6824 case ISD::SMAX:
6825 return ISD::SIGN_EXTEND;
6826 case ISD::SRL:
6827 case ISD::UMIN:
6828 case ISD::UMAX:
6829 return ISD::ZERO_EXTEND;
6830 case ISD::ADD:
6831 case ISD::SUB:
6832 case ISD::AND:
6833 case ISD::OR:
6834 case ISD::XOR:
6835 case ISD::SHL:
6836 case ISD::SELECT:
6837 case ISD::MUL:
6838 // operation result won't be influenced by garbage high bits.
6839 // TODO: are all of those cases correct, and are there more?
6840 return ISD::ANY_EXTEND;
6841 case ISD::SETCC: {
6842 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6844 }
6845 default:
6846 llvm_unreachable("unexpected opcode!");
6847 }
6848}
6849
6850SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6851 DAGCombinerInfo &DCI) const {
6852 const unsigned Opc = Op.getOpcode();
6853 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6854 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6855 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6856 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6857 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6858
6859 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6860 : Op->getOperand(0).getValueType();
6861 auto ExtTy = OpTy.changeElementType(MVT::i32);
6862
6863 if (DCI.isBeforeLegalizeOps() ||
6864 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6865 return SDValue();
6866
6867 auto &DAG = DCI.DAG;
6868
6869 SDLoc DL(Op);
6870 SDValue LHS;
6871 SDValue RHS;
6872 if (Opc == ISD::SELECT) {
6873 LHS = Op->getOperand(1);
6874 RHS = Op->getOperand(2);
6875 } else {
6876 LHS = Op->getOperand(0);
6877 RHS = Op->getOperand(1);
6878 }
6879
6880 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6881 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6882
6883 // Special case: for shifts, the RHS always needs a zext.
6884 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6885 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6886 else
6887 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6888
6889 // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
6890 if (Opc == ISD::SETCC) {
6891 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6892 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6893 }
6894
6895 // For other ops, we extend the operation's return type as well so we need to
6896 // truncate back to the original type.
6897 SDValue NewVal;
6898 if (Opc == ISD::SELECT)
6899 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6900 else
6901 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6902
6903 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6904}
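// Example of the promotion above (a sketch): a uniform i16 shl is widened to
// i32 so it can be selected to an SALU instruction, then truncated back:
//
//   (shl i16:%a, i16:%b)
//     --> (trunc i16 (shl i32 (any_extend %a), (zero_extend %b)))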
6905
6906// Custom lowering for vector multiplications and s_mul_u64.
6907SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6908 EVT VT = Op.getValueType();
6909
6910 // Split vector operands.
6911 if (VT.isVector())
6912 return splitBinaryVectorOp(Op, DAG);
6913
6914 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6915
6916 // There are four ways to lower s_mul_u64:
6917 //
6918 // 1. If all the operands are uniform, then we lower it as it is.
6919 //
6920 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6921 // multiplications because there is no vector equivalent of s_mul_u64.
6922 //
6923 // 3. If the cost model decides that it is more efficient to use vector
6924 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6925 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
6926 //
6927 // 4. If the cost model decides to use vector registers and both of the
6928 // operands are zero-extended/sign-extended from 32-bits, then we split the
6929 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6930 // possible to check if the operands are zero-extended or sign-extended in
6931 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6932 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6933 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6934 // If the cost model decides that we have to use vector registers, then
6935 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6936 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6937 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6938 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6939 // SIInstrInfo.cpp .
6940
6941 if (Op->isDivergent())
6942 return SDValue();
6943
6944 SDValue Op0 = Op.getOperand(0);
6945 SDValue Op1 = Op.getOperand(1);
6946 // If both operands are zero-extended to 32 bits, then we replace s_mul_u64
6947 // with s_mul_u64_u32_pseudo. If both operands are sign-extended to
6948 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6949 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6950 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6951 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6952 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6953 SDLoc SL(Op);
6954 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6955 return SDValue(
6956 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6957 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6958 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6959 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6960 return SDValue(
6961 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6962 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6963 return Op;
6964}
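// For reference, the 32-bit decomposition referred to in cases 2 and 3 above
// is the usual identity (a sketch; the actual splitting is done later in
// SIInstrInfo):
//
//   (a * b).lo = mul_lo(a.lo, b.lo)
//   (a * b).hi = mul_hi(a.lo, b.lo) + a.lo * b.hi + a.hi * b.lo   // mod 2^32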
6965
6966SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6967 EVT VT = Op.getValueType();
6968 SDLoc SL(Op);
6969 SDValue LHS = Op.getOperand(0);
6970 SDValue RHS = Op.getOperand(1);
6971 bool isSigned = Op.getOpcode() == ISD::SMULO;
6972
6973 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6974 const APInt &C = RHSC->getAPIntValue();
6975 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6976 if (C.isPowerOf2()) {
6977 // smulo(x, signed_min) is the same as umulo(x, signed_min).
6978 bool UseArithShift = isSigned && !C.isMinSignedValue();
6979 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6980 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6981 SDValue Overflow =
6982 DAG.getSetCC(SL, MVT::i1,
6983 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
6984 Result, ShiftAmt),
6985 LHS, ISD::SETNE);
6986 return DAG.getMergeValues({Result, Overflow}, SL);
6987 }
6988 }
6989
6990 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6991 SDValue Top =
6992 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
6993
6994 SDValue Sign = isSigned
6995 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6996 DAG.getConstant(VT.getScalarSizeInBits() - 1,
6997 SL, MVT::i32))
6998 : DAG.getConstant(0, SL, VT);
6999 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7000
7001 return DAG.getMergeValues({Result, Overflow}, SL);
7002}
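// Worked example of the power-of-two case above (illustrative): for a 32-bit
// umulo(x, 8) the product is x << 3 and overflow is detected by shifting back
// and comparing, e.g. with x = 0x30000001:
//
//   Result   = x << 3                 = 0x80000008
//   Overflow = ((Result >> 3) != x)   -> 0x10000001 != 0x30000001 -> true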
7003
7004SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7005 if (Op->isDivergent()) {
7006 // Select to V_MAD_[IU]64_[IU]32.
7007 return Op;
7008 }
7009 if (Subtarget->hasSMulHi()) {
7010 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7011 return SDValue();
7012 }
7013 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7014 // calculate the high part, so we might as well do the whole thing with
7015 // V_MAD_[IU]64_[IU]32.
7016 return Op;
7017}
7018
7019SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7020 if (!Subtarget->isTrapHandlerEnabled() ||
7022 return lowerTrapEndpgm(Op, DAG);
7023
7024 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7025 : lowerTrapHsaQueuePtr(Op, DAG);
7026}
7027
7028SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7029 SDLoc SL(Op);
7030 SDValue Chain = Op.getOperand(0);
7031 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7032}
7033
7034SDValue
7035SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7036 const SDLoc &DL, Align Alignment,
7037 ImplicitParameter Param) const {
7040 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7042 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7045}
7046
7047SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7048 SelectionDAG &DAG) const {
7049 SDLoc SL(Op);
7050 SDValue Chain = Op.getOperand(0);
7051
7052 SDValue QueuePtr;
7053 // For code object version 5, QueuePtr is passed through implicit kernarg.
7054 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7056 QueuePtr =
7057 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7058 } else {
7061 Register UserSGPR = Info->getQueuePtrUserSGPR();
7062
7063 if (UserSGPR == AMDGPU::NoRegister) {
7064 // We probably are in a function incorrectly marked with
7065 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7066 // trap, so just use a null pointer.
7067 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7068 } else {
7069 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7070 MVT::i64);
7071 }
7072 }
7073
7074 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7075 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7076
7078 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7079 ToReg.getValue(1)};
7080 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7081}
7082
7083SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7084 SDLoc SL(Op);
7085 SDValue Chain = Op.getOperand(0);
7086
7087 // We need to simulate the 's_trap 2' instruction on targets that run in
7088 // PRIV=1 (where it is treated as a nop).
7089 if (Subtarget->hasPrivEnabledTrap2NopBug())
7090 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7091
7093 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7094 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7095}
7096
7097SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7098 SDLoc SL(Op);
7099 SDValue Chain = Op.getOperand(0);
7101
7102 if (!Subtarget->isTrapHandlerEnabled() ||
7105 "debugtrap handler not supported",
7106 Op.getDebugLoc(), DS_Warning);
7107 LLVMContext &Ctx = MF.getFunction().getContext();
7108 Ctx.diagnose(NoTrap);
7109 return Chain;
7110 }
7111
7112 uint64_t TrapID =
7114 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7115 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7116}
7117
7118SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7119 SelectionDAG &DAG) const {
7120 if (Subtarget->hasApertureRegs()) {
7121 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7122 ? AMDGPU::SRC_SHARED_BASE
7123 : AMDGPU::SRC_PRIVATE_BASE;
7124 // Note: this feature (register) is broken. When used as a 32-bit operand,
7125 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7126 // bits.
7127 //
7128 // To work around the issue, directly emit a 64 bit mov from this register
7129 // then extract the high bits. Note that this shouldn't even result in a
7130 // shift being emitted and simply become a pair of registers (e.g.):
7131 // s_mov_b64 s[6:7], src_shared_base
7132 // v_mov_b32_e32 v1, s7
7133 //
7134 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7135 // coalescing would kick in and it would think it's okay to use the "HI"
7136 // subregister directly (instead of extracting the HI 32 bits) which is an
7137 // artificial (unusable) register.
7138 // Register TableGen definitions would need an overhaul to get rid of the
7139 // artificial "HI" aperture registers and prevent this kind of issue from
7140 // happening.
7141 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7142 DAG.getRegister(ApertureRegNo, MVT::i64));
7143 return DAG.getNode(
7144 ISD::TRUNCATE, DL, MVT::i32,
7145 DAG.getNode(ISD::SRL, DL, MVT::i64,
7146 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7147 }
7148
7149 // For code object version 5, private_base and shared_base are passed through
7150 // implicit kernargs.
7151 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7155 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7156 }
7157
7160 Register UserSGPR = Info->getQueuePtrUserSGPR();
7161 if (UserSGPR == AMDGPU::NoRegister) {
7162 // We probably are in a function incorrectly marked with
7163 // amdgpu-no-queue-ptr. This is undefined.
7164 return DAG.getUNDEF(MVT::i32);
7165 }
7166
7167 SDValue QueuePtr =
7168 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7169
7170 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7171 // private_segment_aperture_base_hi.
7172 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7173
7174 SDValue Ptr =
7175 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7176
7177 // TODO: Use custom target PseudoSourceValue.
7178 // TODO: We should use the value from the IR intrinsic call, but it might not
7179 // be available and how do we get it?
7181 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7182 commonAlignment(Align(64), StructOffset),
7185}
7186
7187/// Return true if the value is a known valid address, such that a null check is
7188/// not necessary.
7190 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7191 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7192 isa<BasicBlockSDNode>(Val))
7193 return true;
7194
7195 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7196 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7197
7198 // TODO: Search through arithmetic, handle arguments and loads
7199 // marked nonnull.
7200 return false;
7201}
7202
7203SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7204 SelectionDAG &DAG) const {
7205 SDLoc SL(Op);
7206
7207 const AMDGPUTargetMachine &TM =
7208 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7209
7210 unsigned DestAS, SrcAS;
7211 SDValue Src;
7212 bool IsNonNull = false;
7213 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7214 SrcAS = ASC->getSrcAddressSpace();
7215 Src = ASC->getOperand(0);
7216 DestAS = ASC->getDestAddressSpace();
7217 } else {
7218 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7219 Op.getConstantOperandVal(0) ==
7220 Intrinsic::amdgcn_addrspacecast_nonnull);
7221 Src = Op->getOperand(1);
7222 SrcAS = Op->getConstantOperandVal(2);
7223 DestAS = Op->getConstantOperandVal(3);
7224 IsNonNull = true;
7225 }
7226
7227 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7228
7229 // flat -> local/private
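// Conceptually, the nodes built below compute:
//   cast = (src == flat_null) ? segment_null : trunc(src to i32)
// where segment_null comes from TM.getNullPointerValue(DestAS); the
// compare/select is skipped when the source is known to be non-null.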
7230 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7231 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7232 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7233 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7234
7235 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7236 return Ptr;
7237
7238 unsigned NullVal = TM.getNullPointerValue(DestAS);
7239 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7240 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7241
7242 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7243 SegmentNullPtr);
7244 }
7245 }
7246
7247 // local/private -> flat
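// Conceptually, the nodes built below assemble the 64-bit flat pointer as
//   { lo = src32, hi = segment aperture base }
// via bitcast(build_vector(src, aperture)) to i64. Unless the source is
// known non-null, src is then compared against the segment null value and
// either the converted pointer or the 64-bit flat null is selected.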
7248 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7249 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7250 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7251
7252 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7253 SDValue CvtPtr =
7254 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7255 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7256
7257 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7258 return CvtPtr;
7259
7260 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7261 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7262
7263 SDValue NonNull =
7264 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7265
7266 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7267 FlatNullPtr);
7268 }
7269 }
7270
7271 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7272 Op.getValueType() == MVT::i64) {
7275 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7276 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7277 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7278 }
7279
7280 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7281 Src.getValueType() == MVT::i64)
7282 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7283
7284 // global <-> flat are no-ops and never emitted.
7285
7286 const MachineFunction &MF = DAG.getMachineFunction();
7287 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7288 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7289 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7290
7291 return DAG.getUNDEF(Op->getValueType(0));
7292}
7293
7294// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7295// the small vector and inserting them into the big vector. That is better than
7296// the default expansion of doing it via a stack slot. Even though the use of
7297// the stack slot would be optimized away afterwards, the stack slot itself
7298// remains.
7299SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7300 SelectionDAG &DAG) const {
7301 SDValue Vec = Op.getOperand(0);
7302 SDValue Ins = Op.getOperand(1);
7303 SDValue Idx = Op.getOperand(2);
7304 EVT VecVT = Vec.getValueType();
7305 EVT InsVT = Ins.getValueType();
7306 EVT EltVT = VecVT.getVectorElementType();
7307 unsigned InsNumElts = InsVT.getVectorNumElements();
7308 unsigned IdxVal = Idx->getAsZExtVal();
7309 SDLoc SL(Op);
7310
7311 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7312 // Insert 32-bit registers at a time.
7313 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7314
7315 unsigned VecNumElts = VecVT.getVectorNumElements();
7316 EVT NewVecVT =
7317 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7318 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7320 MVT::i32, InsNumElts / 2);
7321
7322 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7323 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7324
7325 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7326 SDValue Elt;
7327 if (InsNumElts == 2) {
7328 Elt = Ins;
7329 } else {
7330 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7331 DAG.getConstant(I, SL, MVT::i32));
7332 }
7333 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7334 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7335 }
7336
7337 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7338 }
7339
7340 for (unsigned I = 0; I != InsNumElts; ++I) {
7341 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7342 DAG.getConstant(I, SL, MVT::i32));
7343 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7344 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7345 }
7346 return Vec;
7347}
7348
7349SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7350 SelectionDAG &DAG) const {
7351 SDValue Vec = Op.getOperand(0);
7352 SDValue InsVal = Op.getOperand(1);
7353 SDValue Idx = Op.getOperand(2);
7354 EVT VecVT = Vec.getValueType();
7355 EVT EltVT = VecVT.getVectorElementType();
7356 unsigned VecSize = VecVT.getSizeInBits();
7357 unsigned EltSize = EltVT.getSizeInBits();
7358 SDLoc SL(Op);
7359
7360 // Specially handle the case of v4i16 with static indexing.
7361 unsigned NumElts = VecVT.getVectorNumElements();
7362 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7363 if (NumElts == 4 && EltSize == 16 && KIdx) {
7364 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7365
7366 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7367 DAG.getConstant(0, SL, MVT::i32));
7368 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7369 DAG.getConstant(1, SL, MVT::i32));
7370
7371 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7372 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7373
7374 unsigned Idx = KIdx->getZExtValue();
7375 bool InsertLo = Idx < 2;
7376 SDValue InsHalf = DAG.getNode(
7377 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7378 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7379 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7380
7381 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7382
7383 SDValue Concat =
7384 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7385 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7386
7387 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7388 }
7389
7390 // Static indexing does not lower to stack access, and hence there is no need
7391 // for special custom lowering to avoid stack access.
7392 if (isa<ConstantSDNode>(Idx))
7393 return SDValue();
7394
7395 // Avoid stack access for dynamic indexing by custom lowering to
7396 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
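//
// Worked example for a 64-bit vector of i16 (v4i16) with dynamic index idx:
//   BitIdx = idx << 4                      // idx * EltSize
//   Mask   = 0xffff << BitIdx              // v_bfm-style mask
//   Result = (splat(val) & Mask) | (vec & ~Mask)
// which is what steps (1)-(4) below build on the integer bitcast of the
// vector.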
7397
7398 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7399
7400 MVT IntVT = MVT::getIntegerVT(VecSize);
7401
7402 // Convert vector index to bit-index and get the required bit mask.
7403 assert(isPowerOf2_32(EltSize));
7404 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7405 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7406 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7407 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7408 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7409
7410 // 1. Create a congruent vector with the target value in each element.
7411 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7412 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7413
7414 // 2. Mask off all other indices except the required index within (1).
7415 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7416
7417 // 3. Mask off the required index within the target vector.
7418 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7419 SDValue RHS =
7420 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7421
7422 // 4. Get (2) and (3) ORed into the target vector.
7423 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7424
7425 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7426}
7427
7428SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7429 SelectionDAG &DAG) const {
7430 SDLoc SL(Op);
7431
7432 EVT ResultVT = Op.getValueType();
7433 SDValue Vec = Op.getOperand(0);
7434 SDValue Idx = Op.getOperand(1);
7435 EVT VecVT = Vec.getValueType();
7436 unsigned VecSize = VecVT.getSizeInBits();
7437 EVT EltVT = VecVT.getVectorElementType();
7438
7439 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7440
7441 // Make sure we do any optimizations that will make it easier to fold
7442 // source modifiers before obscuring it with bit operations.
7443
7444 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7445 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7446 return Combined;
7447
7448 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7449 SDValue Lo, Hi;
7450 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7451
7452 if (VecSize == 128) {
7453 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7454 Lo = DAG.getBitcast(LoVT,
7455 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7456 DAG.getConstant(0, SL, MVT::i32)));
7457 Hi = DAG.getBitcast(HiVT,
7458 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7459 DAG.getConstant(1, SL, MVT::i32)));
7460 } else if (VecSize == 256) {
7461 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7462 SDValue Parts[4];
7463 for (unsigned P = 0; P < 4; ++P) {
7464 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7465 DAG.getConstant(P, SL, MVT::i32));
7466 }
7467
7468 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7469 Parts[0], Parts[1]));
7470 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7471 Parts[2], Parts[3]));
7472 } else {
7473 assert(VecSize == 512);
7474
7475 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7476 SDValue Parts[8];
7477 for (unsigned P = 0; P < 8; ++P) {
7478 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7479 DAG.getConstant(P, SL, MVT::i32));
7480 }
7481
7482 Lo = DAG.getBitcast(LoVT,
7483 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7484 Parts[0], Parts[1], Parts[2], Parts[3]));
7485 Hi = DAG.getBitcast(HiVT,
7486 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7487 Parts[4], Parts[5], Parts[6], Parts[7]));
7488 }
7489
7490 EVT IdxVT = Idx.getValueType();
7491 unsigned NElem = VecVT.getVectorNumElements();
7492 assert(isPowerOf2_32(NElem));
7493 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7494 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7495 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7496 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7497 }
7498
7499 assert(VecSize <= 64);
7500
7501 MVT IntVT = MVT::getIntegerVT(VecSize);
7502
7503 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7504 SDValue VecBC = peekThroughBitcasts(Vec);
7505 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7506 SDValue Src = VecBC.getOperand(0);
7507 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7508 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7509 }
7510
7511 unsigned EltSize = EltVT.getSizeInBits();
7512 assert(isPowerOf2_32(EltSize));
7513
7514 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7515
7516 // Convert vector index to bit-index (* EltSize)
7517 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7518
7519 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7520 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7521
7522 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7523 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7524 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7525 }
7526
7527 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7528}
7529
7530static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7531 assert(Elt % 2 == 0);
7532 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7533}
7534
7535SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7536 SelectionDAG &DAG) const {
7537 SDLoc SL(Op);
7538 EVT ResultVT = Op.getValueType();
7539 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7540
7541 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7542 EVT EltVT = PackVT.getVectorElementType();
7543 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7544
7545 // vector_shuffle <0,1,6,7> lhs, rhs
7546 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7547 //
7548 // vector_shuffle <6,7,2,3> lhs, rhs
7549 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7550 //
7551 // vector_shuffle <6,7,0,1> lhs, rhs
7552 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7553
7554 // Avoid scalarizing when both halves are reading from consecutive elements.
7556 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7557 if (elementPairIsContiguous(SVN->getMask(), I)) {
7558 const int Idx = SVN->getMaskElt(I);
7559 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7560 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7561 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7562 SVN->getOperand(VecIdx),
7563 DAG.getConstant(EltIdx, SL, MVT::i32));
7564 Pieces.push_back(SubVec);
7565 } else {
7566 const int Idx0 = SVN->getMaskElt(I);
7567 const int Idx1 = SVN->getMaskElt(I + 1);
7568 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7569 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7570 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7571 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7572
7573 SDValue Vec0 = SVN->getOperand(VecIdx0);
7574 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7575 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7576
7577 SDValue Vec1 = SVN->getOperand(VecIdx1);
7578 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7579 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7580 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7581 }
7582 }
7583
7584 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7585}
7586
7587SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7588 SelectionDAG &DAG) const {
7589 SDValue SVal = Op.getOperand(0);
7590 EVT ResultVT = Op.getValueType();
7591 EVT SValVT = SVal.getValueType();
7592 SDValue UndefVal = DAG.getUNDEF(SValVT);
7593 SDLoc SL(Op);
7594
7596 VElts.push_back(SVal);
7597 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7598 VElts.push_back(UndefVal);
7599
7600 return DAG.getBuildVector(ResultVT, SL, VElts);
7601}
7602
7603SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7604 SelectionDAG &DAG) const {
7605 SDLoc SL(Op);
7606 EVT VT = Op.getValueType();
7607
7608 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7609 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7610
7611 SDValue Lo = Op.getOperand(0);
7612 SDValue Hi = Op.getOperand(1);
7613
7614 // Avoid adding defined bits with the zero_extend.
7615 if (Hi.isUndef()) {
7616 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7617 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7618 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7619 }
7620
7621 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7622 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7623
7624 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7625 DAG.getConstant(16, SL, MVT::i32));
7626 if (Lo.isUndef())
7627 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7628
7629 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7630 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7631
7632 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7633 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7634 }
7635
7636 // Split into 2-element chunks.
7637 const unsigned NumParts = VT.getVectorNumElements() / 2;
7639 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7640
7642 for (unsigned P = 0; P < NumParts; ++P) {
7643 SDValue Vec = DAG.getBuildVector(
7644 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7645 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7646 }
7647
7648 SDValue Blend =
7649 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7650 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7651}
7652
7654 const GlobalAddressSDNode *GA) const {
7655 // OSes that use ELF REL relocations (instead of RELA) can only store a
7656 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7657 // which can create arbitrary 64-bit addends. (This is only a problem for
7658 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7659 // the high 32 bits of the addend.)
7660 //
7661 // This should be kept in sync with how HasRelocationAddend is initialized in
7662 // the constructor of ELFAMDGPUAsmBackend.
7663 if (!Subtarget->isAmdHsaOS())
7664 return false;
7665
7666 // We can fold offsets for anything that doesn't require a GOT relocation.
7667 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7671}
7672
7673static SDValue
7675 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7676 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7677 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7678 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7679 // lowered to the following code sequence:
7680 //
7681 // For constant address space:
7682 // s_getpc_b64 s[0:1]
7683 // s_add_u32 s0, s0, $symbol
7684 // s_addc_u32 s1, s1, 0
7685 //
7686 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7687 // a fixup or relocation is emitted to replace $symbol with a literal
7688 // constant, which is a pc-relative offset from the encoding of the $symbol
7689 // operand to the global variable.
7690 //
7691 // For global address space:
7692 // s_getpc_b64 s[0:1]
7693 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7694 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7695 //
7696 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7697 // fixups or relocations are emitted to replace $symbol@*@lo and
7698 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7699 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7700 // operand to the global variable.
7701 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7702 SDValue PtrHi;
7703 if (GAFlags == SIInstrInfo::MO_NONE)
7704 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7705 else
7706 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7707 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7708}
7709
7710SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7711 SDValue Op,
7712 SelectionDAG &DAG) const {
7713 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7714 SDLoc DL(GSD);
7715 EVT PtrVT = Op.getValueType();
7716
7717 const GlobalValue *GV = GSD->getGlobal();
7723 GV->hasExternalLinkage()) {
7724 Type *Ty = GV->getValueType();
7725 // HIP uses an unsized array `extern __shared__ T s[]` or similar
7726 // zero-sized type in other languages to declare the dynamic shared
7727 // memory whose size is not known at compile time. Such arrays are
7728 // allocated by the runtime and placed directly after the statically
7729 // allocated ones. They all share the same offset.
7730 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7731 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7732 // Adjust alignment for that dynamic shared memory array.
7734 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7735 MFI->setUsesDynamicLDS(true);
7736 return SDValue(
7737 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7738 }
7739 }
7741 }
7742
7744 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7746 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7747 }
7748
7749 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7750 SDValue AddrLo = DAG.getTargetGlobalAddress(
7751 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7752 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7753
7754 SDValue AddrHi = DAG.getTargetGlobalAddress(
7755 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7756 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7757
7758 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7759 }
7760
7761 if (shouldEmitFixup(GV))
7762 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7763
7764 if (shouldEmitPCReloc(GV))
7765 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7767
7768 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7770
7771 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7773 const DataLayout &DataLayout = DAG.getDataLayout();
7774 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7775 MachinePointerInfo PtrInfo =
7777
7778 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7781}
7782
7784 const SDLoc &DL, SDValue V) const {
7785 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7786 // the destination register.
7787 //
7788 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7789 // so we will end up with redundant moves to m0.
7790 //
7791 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7792
7793 // A Null SDValue creates a glue result.
7794 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7795 V, Chain);
7796 return SDValue(M0, 0);
7797}
7798
7799SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7800 MVT VT,
7801 unsigned Offset) const {
7802 SDLoc SL(Op);
7803 SDValue Param = lowerKernargMemParameter(
7804 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7805 // The local size values will have the hi 16-bits as zero.
7806 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7807 DAG.getValueType(VT));
7808}
7809
7811 EVT VT) {
7813 "non-hsa intrinsic with hsa target",
7814 DL.getDebugLoc());
7815 DAG.getContext()->diagnose(BadIntrin);
7816 return DAG.getUNDEF(VT);
7817}
7818
7820 EVT VT) {
7822 "intrinsic not supported on subtarget",
7823 DL.getDebugLoc());
7824 DAG.getContext()->diagnose(BadIntrin);
7825 return DAG.getUNDEF(VT);
7826}
7827
7829 ArrayRef<SDValue> Elts) {
7830 assert(!Elts.empty());
7831 MVT Type;
7832 unsigned NumElts = Elts.size();
7833
7834 if (NumElts <= 12) {
7835 Type = MVT::getVectorVT(MVT::f32, NumElts);
7836 } else {
7837 assert(Elts.size() <= 16);
7838 Type = MVT::v16f32;
7839 NumElts = 16;
7840 }
7841
7842 SmallVector<SDValue, 16> VecElts(NumElts);
7843 for (unsigned i = 0; i < Elts.size(); ++i) {
7844 SDValue Elt = Elts[i];
7845 if (Elt.getValueType() != MVT::f32)
7846 Elt = DAG.getBitcast(MVT::f32, Elt);
7847 VecElts[i] = Elt;
7848 }
7849 for (unsigned i = Elts.size(); i < NumElts; ++i)
7850 VecElts[i] = DAG.getUNDEF(MVT::f32);
7851
7852 if (NumElts == 1)
7853 return VecElts[0];
7854 return DAG.getBuildVector(Type, DL, VecElts);
7855}
7856
7857static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7858 SDValue Src, int ExtraElts) {
7859 EVT SrcVT = Src.getValueType();
7860
7862
7863 if (SrcVT.isVector())
7864 DAG.ExtractVectorElements(Src, Elts);
7865 else
7866 Elts.push_back(Src);
7867
7868 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7869 while (ExtraElts--)
7870 Elts.push_back(Undef);
7871
7872 return DAG.getBuildVector(CastVT, DL, Elts);
7873}
7874
7875 // Re-construct the required return value for an image load intrinsic.
7876 // This is more complicated due to the optional use of TexFailCtrl, which means
7877 // the required return type is an aggregate.
7879 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7880 bool Unpacked, bool IsD16, int DMaskPop,
7881 int NumVDataDwords, bool IsAtomicPacked16Bit,
7882 const SDLoc &DL) {
7883 // Determine the required return type. This is the same regardless of the
7884 // IsTexFail flag.
7885 EVT ReqRetVT = ResultTypes[0];
7886 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7887 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7888 ? (ReqRetNumElts + 1) / 2
7889 : ReqRetNumElts;
7890
7891 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
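7891 // For example, a packed d16 load with a 3-lane dmask returns 3 x f16:
7891 // ReqRetNumElts = 3, so NumDataDwords = (3 + 1) / 2 = 2, and the popped
7891 // dmask dwords are rounded up the same way.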
7892
7893 MVT DataDwordVT =
7894 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7895
7896 MVT MaskPopVT =
7897 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7898
7899 SDValue Data(Result, 0);
7900 SDValue TexFail;
7901
7902 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7903 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7904 if (MaskPopVT.isVector()) {
7905 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7906 SDValue(Result, 0), ZeroIdx);
7907 } else {
7908 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7909 SDValue(Result, 0), ZeroIdx);
7910 }
7911 }
7912
7913 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7914 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7915 NumDataDwords - MaskPopDwords);
7916
7917 if (IsD16)
7918 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7919
7920 EVT LegalReqRetVT = ReqRetVT;
7921 if (!ReqRetVT.isVector()) {
7922 if (!Data.getValueType().isInteger())
7923 Data = DAG.getNode(ISD::BITCAST, DL,
7924 Data.getValueType().changeTypeToInteger(), Data);
7925 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7926 } else {
7927 // We need to widen the return vector to a legal type
7928 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7929 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7930 LegalReqRetVT =
7932 ReqRetVT.getVectorNumElements() + 1);
7933 }
7934 }
7935 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7936
7937 if (IsTexFail) {
7938 TexFail =
7939 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7940 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7941
7942 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7943 }
7944
7945 if (Result->getNumValues() == 1)
7946 return Data;
7947
7948 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7949}
7950
7951static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7952 SDValue *LWE, bool &IsTexFail) {
7953 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7954
7955 uint64_t Value = TexFailCtrlConst->getZExtValue();
7956 if (Value) {
7957 IsTexFail = true;
7958 }
7959
7960 SDLoc DL(TexFailCtrlConst);
7961 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7962 Value &= ~(uint64_t)0x1;
7963 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7964 Value &= ~(uint64_t)0x2;
7965
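7965 // At this point bit 0 (TFE) and bit 1 (LWE) have been consumed; any other
7965 // bit still set in TexFailCtrl is unsupported, so callers bail out when
7965 // this returns false.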
7966 return Value == 0;
7967}
7968
7970 MVT PackVectorVT,
7971 SmallVectorImpl<SDValue> &PackedAddrs,
7972 unsigned DimIdx, unsigned EndIdx,
7973 unsigned NumGradients) {
7974 SDLoc DL(Op);
7975 for (unsigned I = DimIdx; I < EndIdx; I++) {
7976 SDValue Addr = Op.getOperand(I);
7977
7978 // Gradients are packed with undef for each coordinate.
7979 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7980 // 1D: undef,dx/dh; undef,dx/dv
7981 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7982 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7983 if (((I + 1) >= EndIdx) ||
7984 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7985 I == DimIdx + NumGradients - 1))) {
7986 if (Addr.getValueType() != MVT::i16)
7987 Addr = DAG.getBitcast(MVT::i16, Addr);
7988 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7989 } else {
7990 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7991 I++;
7992 }
7993 Addr = DAG.getBitcast(MVT::f32, Addr);
7994 PackedAddrs.push_back(Addr);
7995 }
7996}
7997
7998SDValue SITargetLowering::lowerImage(SDValue Op,
8000 SelectionDAG &DAG, bool WithChain) const {
8001 SDLoc DL(Op);
8003 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8004 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8006 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8007 unsigned IntrOpcode = Intr->BaseOpcode;
8008 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8009 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8010 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8011
8012 SmallVector<EVT, 3> ResultTypes(Op->values());
8013 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8014 bool IsD16 = false;
8015 bool IsG16 = false;
8016 bool IsA16 = false;
8017 SDValue VData;
8018 int NumVDataDwords = 0;
8019 bool AdjustRetType = false;
8020 bool IsAtomicPacked16Bit = false;
8021
8022 // Offset of intrinsic arguments
8023 const unsigned ArgOffset = WithChain ? 2 : 1;
8024
8025 unsigned DMask;
8026 unsigned DMaskLanes = 0;
8027
8028 if (BaseOpcode->Atomic) {
8029 VData = Op.getOperand(2);
8030
8031 IsAtomicPacked16Bit =
8032 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8033 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8034
8035 bool Is64Bit = VData.getValueSizeInBits() == 64;
8036 if (BaseOpcode->AtomicX2) {
8037 SDValue VData2 = Op.getOperand(3);
8038 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8039 {VData, VData2});
8040 if (Is64Bit)
8041 VData = DAG.getBitcast(MVT::v4i32, VData);
8042
8043 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8044 DMask = Is64Bit ? 0xf : 0x3;
8045 NumVDataDwords = Is64Bit ? 4 : 2;
8046 } else {
8047 DMask = Is64Bit ? 0x3 : 0x1;
8048 NumVDataDwords = Is64Bit ? 2 : 1;
8049 }
8050 } else {
8051 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8052 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8053
8054 if (BaseOpcode->Store) {
8055 VData = Op.getOperand(2);
8056
8057 MVT StoreVT = VData.getSimpleValueType();
8058 if (StoreVT.getScalarType() == MVT::f16) {
8059 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8060 return Op; // D16 is unsupported for this instruction
8061
8062 IsD16 = true;
8063 VData = handleD16VData(VData, DAG, true);
8064 }
8065
8066 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8067 } else if (!BaseOpcode->NoReturn) {
8068 // Work out the number of dwords based on the dmask popcount and underlying
8069 // and whether packing is supported.
8070 MVT LoadVT = ResultTypes[0].getSimpleVT();
8071 if (LoadVT.getScalarType() == MVT::f16) {
8072 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8073 return Op; // D16 is unsupported for this instruction
8074
8075 IsD16 = true;
8076 }
8077
8078 // Confirm that the return type is large enough for the dmask specified
8079 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8080 (!LoadVT.isVector() && DMaskLanes > 1))
8081 return Op;
8082
8083 // The sq blocks of gfx8 and gfx9 do not estimate register use correctly
8084 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8085 // instructions.
8086 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8087 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8088 NumVDataDwords = (DMaskLanes + 1) / 2;
8089 else
8090 NumVDataDwords = DMaskLanes;
8091
8092 AdjustRetType = true;
8093 }
8094 }
8095
8096 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8098
8099 // Check for 16 bit addresses or derivatives and pack if true.
8100 MVT VAddrVT =
8101 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8102 MVT VAddrScalarVT = VAddrVT.getScalarType();
8103 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8104 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8105
8106 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8107 VAddrScalarVT = VAddrVT.getScalarType();
8108 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8109 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8110
8111 // Push back extra arguments.
8112 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8113 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8114 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8115 // Special handling of bias when A16 is on. Bias is of type half but
8116 // occupies a full 32 bits.
8117 SDValue Bias = DAG.getBuildVector(
8118 MVT::v2f16, DL,
8119 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8120 VAddrs.push_back(Bias);
8121 } else {
8122 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8123 "Bias needs to be converted to 16 bit in A16 mode");
8124 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8125 }
8126 }
8127
8128 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8129 // 16 bit gradients are supported, but are tied to the A16 control
8130 // so both gradients and addresses must be 16 bit
8131 LLVM_DEBUG(
8132 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8133 "require 16 bit args for both gradients and addresses");
8134 return Op;
8135 }
8136
8137 if (IsA16) {
8138 if (!ST->hasA16()) {
8139 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8140 "support 16 bit addresses\n");
8141 return Op;
8142 }
8143 }
8144
8145 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8146 // set then we have to compress/pack operands (either addresses, gradients,
8147 // or both).
8148 // In the case where A16 and gradients are tied (no G16 support), we have
8149 // already verified that both IsA16 and IsG16 are true.
8150 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8151 // Activate g16
8152 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8154 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8155 }
8156
8157 // Add gradients (packed or unpacked)
8158 if (IsG16) {
8159 // Pack the gradients
8160 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8161 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8162 ArgOffset + Intr->GradientStart,
8163 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8164 } else {
8165 for (unsigned I = ArgOffset + Intr->GradientStart;
8166 I < ArgOffset + Intr->CoordStart; I++)
8167 VAddrs.push_back(Op.getOperand(I));
8168 }
8169
8170 // Add addresses (packed or unpacked)
8171 if (IsA16) {
8172 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8173 ArgOffset + Intr->CoordStart, VAddrEnd,
8174 0 /* No gradients */);
8175 } else {
8176 // Add uncompressed address
8177 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8178 VAddrs.push_back(Op.getOperand(I));
8179 }
8180
8181 // If the register allocator cannot place the address registers contiguously
8182 // without introducing moves, then using the non-sequential address encoding
8183 // is always preferable, since it saves VALU instructions and is usually a
8184 // wash in terms of code size or even better.
8185 //
8186 // However, we currently have no way of hinting to the register allocator that
8187 // MIMG addresses should be placed contiguously when it is possible to do so,
8188 // so force non-NSA for the common 2-address case as a heuristic.
8189 //
8190 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8191 // allocation when possible.
8192 //
8193 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8194 // set of the remaining addresses.
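8194 //
8194 // For illustration: with an NSA limit of N addresses and more than N
8194 // address dwords, UsePartialNSA keeps the first N-1 addresses as separate
8194 // operands and packs the remainder into one contiguous vector (see the
8194 // take_front/drop_front split below).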
8195 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8196 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8197 const bool UseNSA = ST->hasNSAEncoding() &&
8198 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8199 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8200 const bool UsePartialNSA =
8201 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8202
8203 SDValue VAddr;
8204 if (UsePartialNSA) {
8205 VAddr = getBuildDwordsVector(DAG, DL,
8206 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8207 } else if (!UseNSA) {
8208 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8209 }
8210
8211 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8212 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8213 SDValue Unorm;
8214 if (!BaseOpcode->Sampler) {
8215 Unorm = True;
8216 } else {
8217 uint64_t UnormConst =
8218 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8219
8220 Unorm = UnormConst ? True : False;
8221 }
8222
8223 SDValue TFE;
8224 SDValue LWE;
8225 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8226 bool IsTexFail = false;
8227 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8228 return Op;
8229
8230 if (IsTexFail) {
8231 if (!DMaskLanes) {
8232 // Expecting to get an error flag since TFC is on - and dmask is 0
8233 // Force dmask to be at least 1 otherwise the instruction will fail
8234 DMask = 0x1;
8235 DMaskLanes = 1;
8236 NumVDataDwords = 1;
8237 }
8238 NumVDataDwords += 1;
8239 AdjustRetType = true;
8240 }
8241
8242 // Something earlier may have tagged the return type as needing adjustment.
8243 // This happens if the instruction is a load or has TexFailCtrl flags set.
8244 if (AdjustRetType) {
8245 // NumVDataDwords reflects the true number of dwords required in the return
8246 // type
8247 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8248 // This is a no-op load. This can be eliminated
8249 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8250 if (isa<MemSDNode>(Op))
8251 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8252 return Undef;
8253 }
8254
8255 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8256 MVT::i32, NumVDataDwords)
8257 : MVT::i32;
8258
8259 ResultTypes[0] = NewVT;
8260 if (ResultTypes.size() == 3) {
8261 // Original result was aggregate type used for TexFailCtrl results
8262 // The actual instruction returns as a vector type which has now been
8263 // created. Remove the aggregate result.
8264 ResultTypes.erase(&ResultTypes[1]);
8265 }
8266 }
8267
8268 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8269 if (BaseOpcode->Atomic)
8270 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8271 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8273 return Op;
8274
8276 if (BaseOpcode->Store || BaseOpcode->Atomic)
8277 Ops.push_back(VData); // vdata
8278 if (UsePartialNSA) {
8279 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8280 Ops.push_back(VAddr);
8281 } else if (UseNSA)
8282 append_range(Ops, VAddrs);
8283 else
8284 Ops.push_back(VAddr);
8285 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8286 EVT RsrcVT = Rsrc.getValueType();
8287 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8288 return Op;
8289 Ops.push_back(Rsrc);
8290 if (BaseOpcode->Sampler) {
8291 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8292 if (Samp.getValueType() != MVT::v4i32)
8293 return Op;
8294 Ops.push_back(Samp);
8295 }
8296 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8297 if (IsGFX10Plus)
8298 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8299 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8300 Ops.push_back(Unorm);
8301 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8302 Ops.push_back(IsA16 && // r128, a16 for gfx9
8303 ST->hasFeature(AMDGPU::FeatureR128A16)
8304 ? True
8305 : False);
8306 if (IsGFX10Plus)
8307 Ops.push_back(IsA16 ? True : False);
8308 if (!Subtarget->hasGFX90AInsts()) {
8309 Ops.push_back(TFE); // tfe
8310 } else if (TFE->getAsZExtVal()) {
8311 report_fatal_error("TFE is not supported on this GPU");
8312 }
8313 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8314 Ops.push_back(LWE); // lwe
8315 if (!IsGFX10Plus)
8316 Ops.push_back(DimInfo->DA ? True : False);
8317 if (BaseOpcode->HasD16)
8318 Ops.push_back(IsD16 ? True : False);
8319 if (isa<MemSDNode>(Op))
8320 Ops.push_back(Op.getOperand(0)); // chain
8321
8322 int NumVAddrDwords =
8323 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8324 int Opcode = -1;
8325
8326 if (IsGFX12Plus) {
8327 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8328 NumVDataDwords, NumVAddrDwords);
8329 } else if (IsGFX11Plus) {
8330 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8331 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8332 : AMDGPU::MIMGEncGfx11Default,
8333 NumVDataDwords, NumVAddrDwords);
8334 } else if (IsGFX10Plus) {
8335 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8336 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8337 : AMDGPU::MIMGEncGfx10Default,
8338 NumVDataDwords, NumVAddrDwords);
8339 } else {
8340 if (Subtarget->hasGFX90AInsts()) {
8341 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8342 NumVDataDwords, NumVAddrDwords);
8343 if (Opcode == -1)
8345 "requested image instruction is not supported on this GPU");
8346 }
8347 if (Opcode == -1 &&
8349 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8350 NumVDataDwords, NumVAddrDwords);
8351 if (Opcode == -1)
8352 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8353 NumVDataDwords, NumVAddrDwords);
8354 }
8355 if (Opcode == -1)
8356 return Op;
8357
8358 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8359 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8360 MachineMemOperand *MemRef = MemOp->getMemOperand();
8361 DAG.setNodeMemRefs(NewNode, {MemRef});
8362 }
8363
8364 if (BaseOpcode->AtomicX2) {
8366 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8367 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8368 }
8369 if (BaseOpcode->NoReturn)
8370 return SDValue(NewNode, 0);
8371 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8372 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8373 NumVDataDwords, IsAtomicPacked16Bit, DL);
8374}
8375
8376SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8377 SDValue Offset, SDValue CachePolicy,
8378 SelectionDAG &DAG) const {
8380
8381 const DataLayout &DataLayout = DAG.getDataLayout();
8382 Align Alignment =
8384
8389 VT.getStoreSize(), Alignment);
8390
8391 if (!Offset->isDivergent()) {
8392 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8393
8394 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8395 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8396 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
8397 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8398 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8399 SDValue BufferLoad =
8401 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8402 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8403 }
8404
8405 // Widen vec3 load to vec4.
8406 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8407 !Subtarget->hasScalarDwordx3Loads()) {
8408 EVT WidenedVT =
8410 auto WidenedOp = DAG.getMemIntrinsicNode(
8411 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8412 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8413 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8414 DAG.getVectorIdxConstant(0, DL));
8415 return Subvector;
8416 }
8417
8419 DAG.getVTList(VT), Ops, VT, MMO);
8420 }
8421
8422 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8423 // assume that the buffer is unswizzled.
8424 SDValue Ops[] = {
8425 DAG.getEntryNode(), // Chain
8426 Rsrc, // rsrc
8427 DAG.getConstant(0, DL, MVT::i32), // vindex
8428 {}, // voffset
8429 {}, // soffset
8430 {}, // offset
8431 CachePolicy, // cachepolicy
8432 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8433 };
8434 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8435 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8436 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8437 }
8438
8440 unsigned NumLoads = 1;
8441 MVT LoadVT = VT.getSimpleVT();
8442 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8443 assert((LoadVT.getScalarType() == MVT::i32 ||
8444 LoadVT.getScalarType() == MVT::f32));
8445
8446 if (NumElts == 8 || NumElts == 16) {
8447 NumLoads = NumElts / 4;
8448 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8449 }
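8449 // For example, a uniform <8 x f32> result with a divergent offset is split
8449 // into NumLoads = 2 dwordx4 buffer loads; the loop below issues them at
8449 // immediate offsets base+0 and base+16 and the results are concatenated
8449 // afterwards.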
8450
8451 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8452
8453 // Use the alignment to ensure that the required offsets will fit into the
8454 // immediate offsets.
8455 setBufferOffsets(Offset, DAG, &Ops[3],
8456 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8457
8458 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8459 for (unsigned i = 0; i < NumLoads; ++i) {
8460 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8461 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8462 LoadVT, MMO, DAG));
8463 }
8464
8465 if (NumElts == 8 || NumElts == 16)
8466 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8467
8468 return Loads[0];
8469}
8470
8471SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8472 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8473 if (!Subtarget->hasArchitectedSGPRs())
8474 return {};
8475 SDLoc SL(Op);
8476 MVT VT = MVT::i32;
8477 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
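8477 // The BFE below extracts 5 bits starting at bit 25, i.e. the wave ID
8477 // within the group is (TTMP8 >> 25) & 0x1f.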
8478 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8479 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8480}
8481
8482SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8483 unsigned Dim,
8484 const ArgDescriptor &Arg) const {
8485 SDLoc SL(Op);
8487 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8488 if (MaxID == 0)
8489 return DAG.getConstant(0, SL, MVT::i32);
8490
8491 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8492 SDLoc(DAG.getEntryNode()), Arg);
8493
8494 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8495 // masking operations anyway.
8496 //
8497 // TODO: We could assert the top bit is 0 for the source copy.
8498 if (Arg.isMasked())
8499 return Val;
8500
8501 // Preserve the known bits after expansion to a copy.
8503 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8504 DAG.getValueType(SmallVT));
8505}
8506
8507SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8508 SelectionDAG &DAG) const {
8510 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8511
8512 EVT VT = Op.getValueType();
8513 SDLoc DL(Op);
8514 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8515
8516 // TODO: Should this propagate fast-math-flags?
8517
8518 switch (IntrinsicID) {
8519 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8520 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8521 return emitNonHSAIntrinsicError(DAG, DL, VT);
8522 return getPreloadedValue(DAG, *MFI, VT,
8524 }
8525 case Intrinsic::amdgcn_dispatch_ptr:
8526 case Intrinsic::amdgcn_queue_ptr: {
8527 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8528 DiagnosticInfoUnsupported BadIntrin(
8529 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8530 DL.getDebugLoc());
8531 DAG.getContext()->diagnose(BadIntrin);
8532 return DAG.getUNDEF(VT);
8533 }
8534
8535 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8538 return getPreloadedValue(DAG, *MFI, VT, RegID);
8539 }
8540 case Intrinsic::amdgcn_implicitarg_ptr: {
8541 if (MFI->isEntryFunction())
8542 return getImplicitArgPtr(DAG, DL);
8543 return getPreloadedValue(DAG, *MFI, VT,
8545 }
8546 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8548 // This only makes sense to call in a kernel, so just lower to null.
8549 return DAG.getConstant(0, DL, VT);
8550 }
8551
8552 return getPreloadedValue(DAG, *MFI, VT,
8554 }
8555 case Intrinsic::amdgcn_dispatch_id: {
8556 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8557 }
8558 case Intrinsic::amdgcn_rcp:
8559 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8560 case Intrinsic::amdgcn_rsq:
8561 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8562 case Intrinsic::amdgcn_rsq_legacy:
8564 return emitRemovedIntrinsicError(DAG, DL, VT);
8565 return SDValue();
8566 case Intrinsic::amdgcn_rcp_legacy:
8568 return emitRemovedIntrinsicError(DAG, DL, VT);
8569 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8570 case Intrinsic::amdgcn_rsq_clamp: {
8572 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8573
8574 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8577
8578 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8579 SDValue Tmp =
8580 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8581 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8582 DAG.getConstantFP(Min, DL, VT));
8583 }
8584 case Intrinsic::r600_read_ngroups_x:
8585 if (Subtarget->isAmdHsaOS())
8586 return emitNonHSAIntrinsicError(DAG, DL, VT);
8587
8588 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8590 false);
8591 case Intrinsic::r600_read_ngroups_y:
8592 if (Subtarget->isAmdHsaOS())
8593 return emitNonHSAIntrinsicError(DAG, DL, VT);
8594
8595 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8597 false);
8598 case Intrinsic::r600_read_ngroups_z:
8599 if (Subtarget->isAmdHsaOS())
8600 return emitNonHSAIntrinsicError(DAG, DL, VT);
8601
8602 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8604 false);
8605 case Intrinsic::r600_read_global_size_x:
8606 if (Subtarget->isAmdHsaOS())
8607 return emitNonHSAIntrinsicError(DAG, DL, VT);
8608
8609     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8610                                     SI::KernelInputOffsets::GLOBAL_SIZE_X,
8611 Align(4), false);
8612 case Intrinsic::r600_read_global_size_y:
8613 if (Subtarget->isAmdHsaOS())
8614 return emitNonHSAIntrinsicError(DAG, DL, VT);
8615
8616     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8617                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8618 Align(4), false);
8619 case Intrinsic::r600_read_global_size_z:
8620 if (Subtarget->isAmdHsaOS())
8621 return emitNonHSAIntrinsicError(DAG, DL, VT);
8622
8623     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8624                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8625 Align(4), false);
8626 case Intrinsic::r600_read_local_size_x:
8627 if (Subtarget->isAmdHsaOS())
8628 return emitNonHSAIntrinsicError(DAG, DL, VT);
8629
8630     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8631                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
8632 case Intrinsic::r600_read_local_size_y:
8633 if (Subtarget->isAmdHsaOS())
8634 return emitNonHSAIntrinsicError(DAG, DL, VT);
8635
8636     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8637                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
8638 case Intrinsic::r600_read_local_size_z:
8639 if (Subtarget->isAmdHsaOS())
8640 return emitNonHSAIntrinsicError(DAG, DL, VT);
8641
8642     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8643                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
8644 case Intrinsic::amdgcn_workgroup_id_x:
8645     return getPreloadedValue(DAG, *MFI, VT,
8646                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8647 case Intrinsic::amdgcn_workgroup_id_y:
8648     return getPreloadedValue(DAG, *MFI, VT,
8649                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8650 case Intrinsic::amdgcn_workgroup_id_z:
8651     return getPreloadedValue(DAG, *MFI, VT,
8652                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8653 case Intrinsic::amdgcn_wave_id:
8654 return lowerWaveID(DAG, Op);
8655 case Intrinsic::amdgcn_lds_kernel_id: {
8656 if (MFI->isEntryFunction())
8657 return getLDSKernelId(DAG, DL);
8658     return getPreloadedValue(DAG, *MFI, VT,
8659                              AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8660 }
8661 case Intrinsic::amdgcn_workitem_id_x:
8662 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8663 case Intrinsic::amdgcn_workitem_id_y:
8664 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8665 case Intrinsic::amdgcn_workitem_id_z:
8666 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8667   case Intrinsic::amdgcn_wavefrontsize:
8668     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8669 SDLoc(Op), MVT::i32);
8670 case Intrinsic::amdgcn_s_buffer_load: {
8671 unsigned CPol = Op.getConstantOperandVal(3);
8672 // s_buffer_load, because of how it's optimized, can't be volatile
8673 // so reject ones with the volatile bit set.
8674     if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8675                      ? AMDGPU::CPol::ALL
8676                      : AMDGPU::CPol::ALL_pregfx12))
8677 return Op;
8678 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8679 Op.getOperand(3), DAG);
8680 }
8681 case Intrinsic::amdgcn_fdiv_fast:
8682 return lowerFDIV_FAST(Op, DAG);
8683 case Intrinsic::amdgcn_sin:
8684 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8685
8686 case Intrinsic::amdgcn_cos:
8687 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8688
8689 case Intrinsic::amdgcn_mul_u24:
8690 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8691 Op.getOperand(2));
8692 case Intrinsic::amdgcn_mul_i24:
8693 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8694 Op.getOperand(2));
8695
8696   case Intrinsic::amdgcn_log_clamp: {
8697     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8698 return SDValue();
8699
8700 return emitRemovedIntrinsicError(DAG, DL, VT);
8701 }
8702 case Intrinsic::amdgcn_fract:
8703 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8704
8705 case Intrinsic::amdgcn_class:
8706 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8707 Op.getOperand(2));
8708 case Intrinsic::amdgcn_div_fmas:
8709 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8710 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8711
8712 case Intrinsic::amdgcn_div_fixup:
8713 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8714 Op.getOperand(2), Op.getOperand(3));
8715
8716 case Intrinsic::amdgcn_div_scale: {
8717 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8718
8719     // Translate to the operands expected by the machine instruction. The first
8720     // operand must be the same value (numerator or denominator) that is selected.
8721 SDValue Numerator = Op.getOperand(1);
8722 SDValue Denominator = Op.getOperand(2);
8723
8724 // Note this order is opposite of the machine instruction's operations,
8725 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8726 // intrinsic has the numerator as the first operand to match a normal
8727 // division operation.
8728
8729 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8730
8731 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8732 Denominator, Numerator);
8733 }
8734 case Intrinsic::amdgcn_icmp: {
8735 // There is a Pat that handles this variant, so return it as-is.
8736 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8737 Op.getConstantOperandVal(2) == 0 &&
8738 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8739 return Op;
8740 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8741 }
8742 case Intrinsic::amdgcn_fcmp: {
8743 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8744 }
8745 case Intrinsic::amdgcn_ballot:
8746 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8747 case Intrinsic::amdgcn_fmed3:
8748 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8749 Op.getOperand(2), Op.getOperand(3));
8750 case Intrinsic::amdgcn_fdot2:
8751 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8752 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8753 case Intrinsic::amdgcn_fmul_legacy:
8754 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8755 Op.getOperand(2));
8756 case Intrinsic::amdgcn_sffbh:
8757 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8758 case Intrinsic::amdgcn_sbfe:
8759 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8760 Op.getOperand(2), Op.getOperand(3));
8761 case Intrinsic::amdgcn_ubfe:
8762 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8763 Op.getOperand(2), Op.getOperand(3));
8764 case Intrinsic::amdgcn_cvt_pkrtz:
8765 case Intrinsic::amdgcn_cvt_pknorm_i16:
8766 case Intrinsic::amdgcn_cvt_pknorm_u16:
8767 case Intrinsic::amdgcn_cvt_pk_i16:
8768 case Intrinsic::amdgcn_cvt_pk_u16: {
8769 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8770 EVT VT = Op.getValueType();
8771 unsigned Opcode;
8772
8773     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8774       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8775     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8776       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8777     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8778       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8779     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8780       Opcode = AMDGPUISD::CVT_PK_I16_I32;
8781     else
8782       Opcode = AMDGPUISD::CVT_PK_U16_U32;
8783
8784 if (isTypeLegal(VT))
8785 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8786
8787 SDValue Node =
8788 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8789 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8790 }
8791 case Intrinsic::amdgcn_fmad_ftz:
8792 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8793 Op.getOperand(2), Op.getOperand(3));
8794
8795 case Intrinsic::amdgcn_if_break:
8796 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8797 Op->getOperand(1), Op->getOperand(2)),
8798 0);
8799
8800   case Intrinsic::amdgcn_groupstaticsize: {
8801     Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8802 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8803 return Op;
8804
8805 const Module *M = MF.getFunction().getParent();
8806 const GlobalValue *GV =
8807 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8808     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8809                                             SIInstrInfo::MO_ABS32_LO);
8810 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8811 }
8812 case Intrinsic::amdgcn_is_shared:
8813 case Intrinsic::amdgcn_is_private: {
8814 SDLoc SL(Op);
8815     unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8816                       ? AMDGPUAS::LOCAL_ADDRESS
8817                       : AMDGPUAS::PRIVATE_ADDRESS;
8818 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8819 SDValue SrcVec =
8820 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8821
8822 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8823 DAG.getConstant(1, SL, MVT::i32));
8824 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8825 }
8826 case Intrinsic::amdgcn_perm:
8827 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8828 Op.getOperand(2), Op.getOperand(3));
8829 case Intrinsic::amdgcn_reloc_constant: {
8830 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8831 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8832 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8833 auto *RelocSymbol = cast<GlobalVariable>(
8834 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8835     SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8836                                             SIInstrInfo::MO_ABS32_LO);
8837 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8838 }
8839 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8840 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8841 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8842 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8843 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8844 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8845 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8846 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8847 if (Op.getOperand(4).getValueType() == MVT::i32)
8848 return SDValue();
8849
8850 SDLoc SL(Op);
8851 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8852 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8853 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8854 Op.getOperand(3), IndexKeyi32);
8855 }
8856 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8857 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8858 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8859 if (Op.getOperand(6).getValueType() == MVT::i32)
8860 return SDValue();
8861
8862 SDLoc SL(Op);
8863 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8864 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8865 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8866 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8867 IndexKeyi32, Op.getOperand(7)});
8868 }
8869 case Intrinsic::amdgcn_addrspacecast_nonnull:
8870 return lowerADDRSPACECAST(Op, DAG);
8871 case Intrinsic::amdgcn_readlane:
8872 case Intrinsic::amdgcn_readfirstlane:
8873 case Intrinsic::amdgcn_writelane:
8874 case Intrinsic::amdgcn_permlane16:
8875 case Intrinsic::amdgcn_permlanex16:
8876 case Intrinsic::amdgcn_permlane64:
8877 case Intrinsic::amdgcn_set_inactive:
8878 case Intrinsic::amdgcn_set_inactive_chain_arg:
8879 case Intrinsic::amdgcn_mov_dpp8:
8880 case Intrinsic::amdgcn_update_dpp:
8881 return lowerLaneOp(*this, Op.getNode(), DAG);
8882 default:
8883     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8884             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8885 return lowerImage(Op, ImageDimIntr, DAG, false);
8886
8887 return Op;
8888 }
8889}
8890
8891 // On targets that do not support a constant in the soffset field, turn a zero
8892 // soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8893 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8894                              const GCNSubtarget *Subtarget) {
8895 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8896 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8897 return SOffset;
8898}
8899
8900SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8901 SelectionDAG &DAG,
8902 unsigned NewOpcode) const {
8903 SDLoc DL(Op);
8904
8905 SDValue VData = Op.getOperand(2);
8906 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8907 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8908 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8909 SDValue Ops[] = {
8910 Op.getOperand(0), // Chain
8911 VData, // vdata
8912 Rsrc, // rsrc
8913 DAG.getConstant(0, DL, MVT::i32), // vindex
8914 VOffset, // voffset
8915 SOffset, // soffset
8916 Offset, // offset
8917 Op.getOperand(6), // cachepolicy
8918 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8919 };
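    // Note: the raw buffer forms carry no vindex, so a constant-zero vindex is
    // passed with idxen = 0 here; the struct forms below forward the caller's
    // vindex operand and set idxen = 1 instead.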
8920
8921 auto *M = cast<MemSDNode>(Op);
8922
8923 EVT MemVT = VData.getValueType();
8924 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8925 M->getMemOperand());
8926}
8927
8928SDValue
8929SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8930 unsigned NewOpcode) const {
8931 SDLoc DL(Op);
8932
8933 SDValue VData = Op.getOperand(2);
8934 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8935 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8936 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8937 SDValue Ops[] = {
8938 Op.getOperand(0), // Chain
8939 VData, // vdata
8940 Rsrc, // rsrc
8941 Op.getOperand(4), // vindex
8942 VOffset, // voffset
8943 SOffset, // soffset
8944 Offset, // offset
8945 Op.getOperand(7), // cachepolicy
8946 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8947 };
8948
8949 auto *M = cast<MemSDNode>(Op);
8950
8951 EVT MemVT = VData.getValueType();
8952 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8953 M->getMemOperand());
8954}
8955
8956SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8957 SelectionDAG &DAG) const {
8958 unsigned IntrID = Op.getConstantOperandVal(1);
8959 SDLoc DL(Op);
8960
8961 switch (IntrID) {
8962 case Intrinsic::amdgcn_ds_ordered_add:
8963 case Intrinsic::amdgcn_ds_ordered_swap: {
8964 MemSDNode *M = cast<MemSDNode>(Op);
8965 SDValue Chain = M->getOperand(0);
8966 SDValue M0 = M->getOperand(2);
8967 SDValue Value = M->getOperand(3);
8968 unsigned IndexOperand = M->getConstantOperandVal(7);
8969 unsigned WaveRelease = M->getConstantOperandVal(8);
8970 unsigned WaveDone = M->getConstantOperandVal(9);
8971
8972 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8973 IndexOperand &= ~0x3f;
8974 unsigned CountDw = 0;
8975
8976 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8977 CountDw = (IndexOperand >> 24) & 0xf;
8978 IndexOperand &= ~(0xf << 24);
8979
8980 if (CountDw < 1 || CountDw > 4) {
8981         report_fatal_error(
8982             "ds_ordered_count: dword count must be between 1 and 4");
8983 }
8984 }
8985
8986 if (IndexOperand)
8987 report_fatal_error("ds_ordered_count: bad index operand");
8988
8989 if (WaveDone && !WaveRelease)
8990 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8991
8992 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8993     unsigned ShaderType =
8994         SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8995 unsigned Offset0 = OrderedCountIndex << 2;
8996 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8997
8998 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8999 Offset1 |= (CountDw - 1) << 6;
9000
9001 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9002 Offset1 |= ShaderType << 2;
9003
9004 unsigned Offset = Offset0 | (Offset1 << 8);
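    // For example, a ds_ordered_swap with ordered-count index 1, wave_release
    // set, wave_done clear and (on gfx10+) a dword count of 1 gives
    // Offset0 = 4 and Offset1 = 0x11, i.e. an immediate of 0x1104 (before the
    // shader-type bits that pre-gfx11 targets OR in above).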
9005
9006 SDValue Ops[] = {
9007 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9008 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9009 };
9010     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9011                                    M->getVTList(), Ops, M->getMemoryVT(),
9012 M->getMemOperand());
9013 }
9014 case Intrinsic::amdgcn_raw_buffer_load:
9015 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9016 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9017 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9018 case Intrinsic::amdgcn_raw_buffer_load_format:
9019 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9020 const bool IsFormat =
9021 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9022 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9023
9024 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9025 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9026 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9027 SDValue Ops[] = {
9028 Op.getOperand(0), // Chain
9029 Rsrc, // rsrc
9030 DAG.getConstant(0, DL, MVT::i32), // vindex
9031 VOffset, // voffset
9032 SOffset, // soffset
9033 Offset, // offset
9034 Op.getOperand(5), // cachepolicy, swizzled buffer
9035 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9036 };
9037
9038 auto *M = cast<MemSDNode>(Op);
9039 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9040 }
9041 case Intrinsic::amdgcn_struct_buffer_load:
9042 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9043 case Intrinsic::amdgcn_struct_buffer_load_format:
9044 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9045 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9046 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9047 const bool IsFormat =
9048 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9049 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9050
9051 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9052 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9053 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9054 SDValue Ops[] = {
9055 Op.getOperand(0), // Chain
9056 Rsrc, // rsrc
9057 Op.getOperand(3), // vindex
9058 VOffset, // voffset
9059 SOffset, // soffset
9060 Offset, // offset
9061 Op.getOperand(6), // cachepolicy, swizzled buffer
9062 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9063 };
9064
9065 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9066 }
9067 case Intrinsic::amdgcn_raw_tbuffer_load:
9068 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9069 MemSDNode *M = cast<MemSDNode>(Op);
9070 EVT LoadVT = Op.getValueType();
9071 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9072 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9073 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9074
9075 SDValue Ops[] = {
9076 Op.getOperand(0), // Chain
9077 Rsrc, // rsrc
9078 DAG.getConstant(0, DL, MVT::i32), // vindex
9079 VOffset, // voffset
9080 SOffset, // soffset
9081 Offset, // offset
9082 Op.getOperand(5), // format
9083 Op.getOperand(6), // cachepolicy, swizzled buffer
9084 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9085 };
9086
9087 if (LoadVT.getScalarType() == MVT::f16)
9088 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9089 Ops);
9090 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9091 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9092 DAG);
9093 }
9094 case Intrinsic::amdgcn_struct_tbuffer_load:
9095 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9096 MemSDNode *M = cast<MemSDNode>(Op);
9097 EVT LoadVT = Op.getValueType();
9098 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9099 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9100 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9101
9102 SDValue Ops[] = {
9103 Op.getOperand(0), // Chain
9104 Rsrc, // rsrc
9105 Op.getOperand(3), // vindex
9106 VOffset, // voffset
9107 SOffset, // soffset
9108 Offset, // offset
9109 Op.getOperand(6), // format
9110 Op.getOperand(7), // cachepolicy, swizzled buffer
9111 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9112 };
9113
9114 if (LoadVT.getScalarType() == MVT::f16)
9115 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9116 Ops);
9117 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9118 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9119 DAG);
9120 }
9121 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9122 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9123 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9124 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9125 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9126     return lowerStructBufferAtomicIntrin(Op, DAG,
9127                                          AMDGPUISD::BUFFER_ATOMIC_FADD);
9128 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9129 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9130 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9131 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9132 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9133     return lowerStructBufferAtomicIntrin(Op, DAG,
9134                                          AMDGPUISD::BUFFER_ATOMIC_FMIN);
9135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9137 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9138 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9140     return lowerStructBufferAtomicIntrin(Op, DAG,
9141                                          AMDGPUISD::BUFFER_ATOMIC_FMAX);
9142 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9143 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9144 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9145 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9147 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9148 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9150 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9151 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9152 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9153 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9154 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9155 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9156 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9157 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9158 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9159 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9160 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9162 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9163 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9165 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9166 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9167 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9168 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9169 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9170 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9171 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9172 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9174 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9175 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9176 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9177 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9178 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9179     return lowerRawBufferAtomicIntrin(Op, DAG,
9180                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9181 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9182 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9183     return lowerStructBufferAtomicIntrin(Op, DAG,
9184                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
9185 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9186 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9187 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9188 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9189 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9190 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9191 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9192 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9193     return lowerStructBufferAtomicIntrin(Op, DAG,
9194                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
9195 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9196 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9197     return lowerStructBufferAtomicIntrin(Op, DAG,
9198                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
9199 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9200 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9201     return lowerStructBufferAtomicIntrin(Op, DAG,
9202                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
9203 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9204 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9205     return lowerStructBufferAtomicIntrin(Op, DAG,
9206                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
9207 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9209 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9210 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9211 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9212 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9213 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9214 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9215 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9216 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9217 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9218 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9219 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9220 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9221 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9222 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9223     return lowerStructBufferAtomicIntrin(Op, DAG,
9224                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9225
9226 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9227 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9228 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9229 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9230 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9231 SDValue Ops[] = {
9232 Op.getOperand(0), // Chain
9233 Op.getOperand(2), // src
9234 Op.getOperand(3), // cmp
9235 Rsrc, // rsrc
9236 DAG.getConstant(0, DL, MVT::i32), // vindex
9237 VOffset, // voffset
9238 SOffset, // soffset
9239 Offset, // offset
9240 Op.getOperand(7), // cachepolicy
9241 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9242 };
9243 EVT VT = Op.getValueType();
9244 auto *M = cast<MemSDNode>(Op);
9245
9246     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9247                                    Op->getVTList(), Ops, VT,
9248 M->getMemOperand());
9249 }
9250 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9251 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9252 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9253 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9254 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9255 SDValue Ops[] = {
9256 Op.getOperand(0), // Chain
9257 Op.getOperand(2), // src
9258 Op.getOperand(3), // cmp
9259 Rsrc, // rsrc
9260 Op.getOperand(5), // vindex
9261 VOffset, // voffset
9262 SOffset, // soffset
9263 Offset, // offset
9264 Op.getOperand(8), // cachepolicy
9265 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9266 };
9267 EVT VT = Op.getValueType();
9268 auto *M = cast<MemSDNode>(Op);
9269
9270     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9271                                    Op->getVTList(), Ops, VT,
9272 M->getMemOperand());
9273 }
9274 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9275 MemSDNode *M = cast<MemSDNode>(Op);
9276 SDValue NodePtr = M->getOperand(2);
9277 SDValue RayExtent = M->getOperand(3);
9278 SDValue RayOrigin = M->getOperand(4);
9279 SDValue RayDir = M->getOperand(5);
9280 SDValue RayInvDir = M->getOperand(6);
9281 SDValue TDescr = M->getOperand(7);
9282
9283 assert(NodePtr.getValueType() == MVT::i32 ||
9284 NodePtr.getValueType() == MVT::i64);
9285 assert(RayDir.getValueType() == MVT::v3f16 ||
9286 RayDir.getValueType() == MVT::v3f32);
9287
9288 if (!Subtarget->hasGFX10_AEncoding()) {
9289 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9290 return SDValue();
9291 }
9292
9293 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9294 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9295 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9296 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9297 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9298 const unsigned NumVDataDwords = 4;
9299 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9300 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9301 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9302 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9303 IsGFX12Plus;
9304 const unsigned BaseOpcodes[2][2] = {
9305 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9306 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9307 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9308 int Opcode;
9309 if (UseNSA) {
9310 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9311 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9312 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9313 : AMDGPU::MIMGEncGfx10NSA,
9314 NumVDataDwords, NumVAddrDwords);
9315 } else {
9316 assert(!IsGFX12Plus);
9317 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9318 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9319 : AMDGPU::MIMGEncGfx10Default,
9320 NumVDataDwords, NumVAddrDwords);
9321 }
9322 assert(Opcode != -1);
9323
9324     SmallVector<SDValue, 16> Ops;
9325
9326 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9327       SmallVector<SDValue, 3> Lanes;
9328       DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9329 if (Lanes[0].getValueSizeInBits() == 32) {
9330 for (unsigned I = 0; I < 3; ++I)
9331 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9332 } else {
9333 if (IsAligned) {
9334 Ops.push_back(DAG.getBitcast(
9335 MVT::i32,
9336 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9337 Ops.push_back(Lanes[2]);
9338 } else {
9339 SDValue Elt0 = Ops.pop_back_val();
9340 Ops.push_back(DAG.getBitcast(
9341 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9342 Ops.push_back(DAG.getBitcast(
9343 MVT::i32,
9344 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9345 }
9346 }
9347 };
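    // For an aligned f16 operand this emits bitcast(v2f16{l0,l1}) followed by
    // the loose l2 element; the next unaligned call then pops that leftover
    // element and pairs it with its own first lane, so no half-dword is wasted.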
9348
9349 if (UseNSA && IsGFX11Plus) {
9350 Ops.push_back(NodePtr);
9351 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9352 Ops.push_back(RayOrigin);
9353 if (IsA16) {
9354 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9355 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9356 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9357 for (unsigned I = 0; I < 3; ++I) {
9358 MergedLanes.push_back(DAG.getBitcast(
9359 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9360 {DirLanes[I], InvDirLanes[I]})));
9361 }
9362 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9363 } else {
9364 Ops.push_back(RayDir);
9365 Ops.push_back(RayInvDir);
9366 }
9367 } else {
9368 if (Is64)
9369 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9370 2);
9371 else
9372 Ops.push_back(NodePtr);
9373
9374 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9375 packLanes(RayOrigin, true);
9376 packLanes(RayDir, true);
9377 packLanes(RayInvDir, false);
9378 }
9379
9380 if (!UseNSA) {
9381 // Build a single vector containing all the operands so far prepared.
9382 if (NumVAddrDwords > 12) {
9383 SDValue Undef = DAG.getUNDEF(MVT::i32);
9384 Ops.append(16 - Ops.size(), Undef);
9385 }
9386 assert(Ops.size() >= 8 && Ops.size() <= 12);
9387 SDValue MergedOps =
9388 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9389 Ops.clear();
9390 Ops.push_back(MergedOps);
9391 }
9392
9393 Ops.push_back(TDescr);
9394 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9395 Ops.push_back(M->getChain());
9396
9397 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9398 MachineMemOperand *MemRef = M->getMemOperand();
9399 DAG.setNodeMemRefs(NewNode, {MemRef});
9400 return SDValue(NewNode, 0);
9401 }
9402 case Intrinsic::amdgcn_global_atomic_fmin_num:
9403 case Intrinsic::amdgcn_global_atomic_fmax_num:
9404 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9405 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9406 MemSDNode *M = cast<MemSDNode>(Op);
9407 SDValue Ops[] = {
9408 M->getOperand(0), // Chain
9409 M->getOperand(2), // Ptr
9410 M->getOperand(3) // Value
9411 };
9412 unsigned Opcode = 0;
9413 switch (IntrID) {
9414 case Intrinsic::amdgcn_global_atomic_fmin_num:
9415 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9416 Opcode = ISD::ATOMIC_LOAD_FMIN;
9417 break;
9418 }
9419 case Intrinsic::amdgcn_global_atomic_fmax_num:
9420 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9421 Opcode = ISD::ATOMIC_LOAD_FMAX;
9422 break;
9423 }
9424 default:
9425 llvm_unreachable("unhandled atomic opcode");
9426 }
9427 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9428 Ops, M->getMemOperand());
9429 }
9430 case Intrinsic::amdgcn_s_get_barrier_state:
9431 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9432 SDValue Chain = Op->getOperand(0);
9433     SmallVector<SDValue, 2> Ops;
9434     unsigned Opc;
9435
9436 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9437 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9438 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9439 BarID = (BarID >> 4) & 0x3F;
9440 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9441 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9442 Ops.push_back(K);
9443 Ops.push_back(Chain);
9444 } else {
9445 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9446 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9447 SDValue M0Val;
9448 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9449 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9450 M0Val = SDValue(
9451 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9452 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9453 0);
9454 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9455 } else
9456 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9457 }
9458
9459 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9460 return SDValue(NewMI, 0);
9461 }
9462 default:
9463
9464     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9465             AMDGPU::getImageDimIntrinsicInfo(IntrID))
9466 return lowerImage(Op, ImageDimIntr, DAG, true);
9467
9468 return SDValue();
9469 }
9470}
9471
9472// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9473// dwordx4 if on SI and handle TFE loads.
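// For example, a TFE load of v2f32 is performed as a v3i32 load whose first two
// dwords are the data and whose third dword is the status value returned to the
// caller as the second result.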
9474SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9475 SDVTList VTList,
9476 ArrayRef<SDValue> Ops, EVT MemVT,
9477 MachineMemOperand *MMO,
9478 SelectionDAG &DAG) const {
9479 LLVMContext &C = *DAG.getContext();
9480   MachineFunction &MF = DAG.getMachineFunction();
9481   EVT VT = VTList.VTs[0];
9482
9483 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9484 bool IsTFE = VTList.NumVTs == 3;
9485 if (IsTFE) {
9486 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9487 unsigned NumOpDWords = NumValueDWords + 1;
9488 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9489 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9490 MachineMemOperand *OpDWordsMMO =
9491 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9492 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9493 OpDWordsVT, OpDWordsMMO, DAG);
9494     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9495                                  DAG.getVectorIdxConstant(NumValueDWords, DL));
9496 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9497 SDValue ValueDWords =
9498 NumValueDWords == 1
9499 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9500             : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9501                           EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9502 ZeroIdx);
9503 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9504 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9505 }
9506
9507 if (!Subtarget->hasDwordx3LoadStores() &&
9508 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9509 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9510 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9511 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9512 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9513 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9514 WidenedMemVT, WidenedMMO);
9515     SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9516                                 DAG.getVectorIdxConstant(0, DL));
9517 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9518 }
9519
9520 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9521}
9522
9523SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9524 bool ImageStore) const {
9525 EVT StoreVT = VData.getValueType();
9526
9527 // No change for f16 and legal vector D16 types.
9528 if (!StoreVT.isVector())
9529 return VData;
9530
9531 SDLoc DL(VData);
9532 unsigned NumElements = StoreVT.getVectorNumElements();
9533
9534 if (Subtarget->hasUnpackedD16VMem()) {
9535 // We need to unpack the packed data to store.
9536 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9537 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9538
9539 EVT EquivStoreVT =
9540 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9541 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9542 return DAG.UnrollVectorOp(ZExt.getNode());
9543 }
9544
9545 // The sq block of gfx8.1 does not estimate register use correctly for d16
9546 // image store instructions. The data operand is computed as if it were not a
9547 // d16 image instruction.
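  // For example, a v4f16 store on such a subtarget is packed into two i32
  // dwords and then padded back out to v4i32 with undef so the register count
  // matches what the hardware expects for the non-d16 form.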
9548 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9549 // Bitcast to i16
9550 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9551 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9552
9553 // Decompose into scalars
9554     SmallVector<SDValue, 4> Elts;
9555     DAG.ExtractVectorElements(IntVData, Elts);
9556
9557 // Group pairs of i16 into v2i16 and bitcast to i32
9558 SmallVector<SDValue, 4> PackedElts;
9559 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9560 SDValue Pair =
9561 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9562 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9563 PackedElts.push_back(IntPair);
9564 }
9565 if ((NumElements % 2) == 1) {
9566 // Handle v3i16
9567 unsigned I = Elts.size() / 2;
9568 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9569 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9570 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9571 PackedElts.push_back(IntPair);
9572 }
9573
9574 // Pad using UNDEF
9575 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9576
9577 // Build final vector
9578 EVT VecVT =
9579 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9580 return DAG.getBuildVector(VecVT, DL, PackedElts);
9581 }
9582
9583 if (NumElements == 3) {
9584     EVT IntStoreVT =
9585         EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9586 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9587
9588 EVT WidenedStoreVT = EVT::getVectorVT(
9589 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9590 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9591 WidenedStoreVT.getStoreSizeInBits());
9592 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9593 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9594 }
9595
9596 assert(isTypeLegal(StoreVT));
9597 return VData;
9598}
9599
9600SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9601 SelectionDAG &DAG) const {
9602 SDLoc DL(Op);
9603 SDValue Chain = Op.getOperand(0);
9604 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9605   MachineFunction &MF = DAG.getMachineFunction();
9606
9607 switch (IntrinsicID) {
9608 case Intrinsic::amdgcn_exp_compr: {
9609 if (!Subtarget->hasCompressedExport()) {
9610       DiagnosticInfoUnsupported BadIntrin(
9611           MF.getFunction(),
9612 "intrinsic not supported on subtarget", DL.getDebugLoc());
9613 DAG.getContext()->diagnose(BadIntrin);
9614 }
9615 SDValue Src0 = Op.getOperand(4);
9616 SDValue Src1 = Op.getOperand(5);
9617 // Hack around illegal type on SI by directly selecting it.
9618 if (isTypeLegal(Src0.getValueType()))
9619 return SDValue();
9620
9621 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9622 SDValue Undef = DAG.getUNDEF(MVT::f32);
9623 const SDValue Ops[] = {
9624 Op.getOperand(2), // tgt
9625 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9626 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9627 Undef, // src2
9628 Undef, // src3
9629 Op.getOperand(7), // vm
9630 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9631 Op.getOperand(3), // en
9632 Op.getOperand(0) // Chain
9633 };
9634
9635 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9636 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9637 }
9638 case Intrinsic::amdgcn_s_barrier:
9639 case Intrinsic::amdgcn_s_barrier_signal:
9640   case Intrinsic::amdgcn_s_barrier_wait: {
9641     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9642     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9643 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9644 if (WGSize <= ST.getWavefrontSize()) {
9645 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9646 // s_barrier/s_barrier_wait to wave_barrier.
9647 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9648 return Op.getOperand(0);
9649 else
9650 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9651 MVT::Other, Op.getOperand(0)),
9652 0);
9653 }
9654 }
9655
9656 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9657 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9658       SDValue K =
9659           DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9660 SDValue BarSignal =
9661 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9662 MVT::Other, K, Op.getOperand(0)),
9663 0);
9664 SDValue BarWait =
9665 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9666 BarSignal.getValue(0)),
9667 0);
9668 return BarWait;
9669 }
9670
9671 return SDValue();
9672 };
9673
9674 case Intrinsic::amdgcn_struct_tbuffer_store:
9675 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9676 SDValue VData = Op.getOperand(2);
9677 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9678 if (IsD16)
9679 VData = handleD16VData(VData, DAG);
9680 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9681 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9682 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9683 SDValue Ops[] = {
9684 Chain,
9685 VData, // vdata
9686 Rsrc, // rsrc
9687 Op.getOperand(4), // vindex
9688 VOffset, // voffset
9689 SOffset, // soffset
9690 Offset, // offset
9691 Op.getOperand(7), // format
9692 Op.getOperand(8), // cachepolicy, swizzled buffer
9693 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9694 };
9695     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9696                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
9697 MemSDNode *M = cast<MemSDNode>(Op);
9698 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9699 M->getMemoryVT(), M->getMemOperand());
9700 }
9701
9702 case Intrinsic::amdgcn_raw_tbuffer_store:
9703 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9704 SDValue VData = Op.getOperand(2);
9705 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9706 if (IsD16)
9707 VData = handleD16VData(VData, DAG);
9708 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9709 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9710 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9711 SDValue Ops[] = {
9712 Chain,
9713 VData, // vdata
9714 Rsrc, // rsrc
9715 DAG.getConstant(0, DL, MVT::i32), // vindex
9716 VOffset, // voffset
9717 SOffset, // soffset
9718 Offset, // offset
9719 Op.getOperand(6), // format
9720 Op.getOperand(7), // cachepolicy, swizzled buffer
9721 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9722 };
9723     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9724                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
9725 MemSDNode *M = cast<MemSDNode>(Op);
9726 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9727 M->getMemoryVT(), M->getMemOperand());
9728 }
9729
9730 case Intrinsic::amdgcn_raw_buffer_store:
9731 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9732 case Intrinsic::amdgcn_raw_buffer_store_format:
9733 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9734 const bool IsFormat =
9735 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9736 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9737
9738 SDValue VData = Op.getOperand(2);
9739 EVT VDataVT = VData.getValueType();
9740 EVT EltType = VDataVT.getScalarType();
9741 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9742 if (IsD16) {
9743 VData = handleD16VData(VData, DAG);
9744 VDataVT = VData.getValueType();
9745 }
9746
9747 if (!isTypeLegal(VDataVT)) {
9748 VData =
9749 DAG.getNode(ISD::BITCAST, DL,
9750 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9751 }
9752
9753 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9754 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9755 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9756 SDValue Ops[] = {
9757 Chain,
9758 VData,
9759 Rsrc,
9760 DAG.getConstant(0, DL, MVT::i32), // vindex
9761 VOffset, // voffset
9762 SOffset, // soffset
9763 Offset, // offset
9764 Op.getOperand(6), // cachepolicy, swizzled buffer
9765 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9766 };
9767     unsigned Opc =
9768         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9769 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9770 MemSDNode *M = cast<MemSDNode>(Op);
9771
9772 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9773 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9774 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9775
9776 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9777 M->getMemoryVT(), M->getMemOperand());
9778 }
9779
9780 case Intrinsic::amdgcn_struct_buffer_store:
9781 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9782 case Intrinsic::amdgcn_struct_buffer_store_format:
9783 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9784 const bool IsFormat =
9785 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9786 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9787
9788 SDValue VData = Op.getOperand(2);
9789 EVT VDataVT = VData.getValueType();
9790 EVT EltType = VDataVT.getScalarType();
9791 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9792
9793 if (IsD16) {
9794 VData = handleD16VData(VData, DAG);
9795 VDataVT = VData.getValueType();
9796 }
9797
9798 if (!isTypeLegal(VDataVT)) {
9799 VData =
9800 DAG.getNode(ISD::BITCAST, DL,
9801 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9802 }
9803
9804 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9805 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9806 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9807 SDValue Ops[] = {
9808 Chain,
9809 VData,
9810 Rsrc,
9811 Op.getOperand(4), // vindex
9812 VOffset, // voffset
9813 SOffset, // soffset
9814 Offset, // offset
9815 Op.getOperand(7), // cachepolicy, swizzled buffer
9816 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9817 };
9818     unsigned Opc =
9819         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9820 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9821 MemSDNode *M = cast<MemSDNode>(Op);
9822
9823 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9824 EVT VDataType = VData.getValueType().getScalarType();
9825 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9826 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9827
9828 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9829 M->getMemoryVT(), M->getMemOperand());
9830 }
9831 case Intrinsic::amdgcn_raw_buffer_load_lds:
9832 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9833 case Intrinsic::amdgcn_struct_buffer_load_lds:
9834 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9835 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9836 unsigned Opc;
9837 bool HasVIndex =
9838 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9839 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9840 unsigned OpOffset = HasVIndex ? 1 : 0;
9841 SDValue VOffset = Op.getOperand(5 + OpOffset);
9842 bool HasVOffset = !isNullConstant(VOffset);
9843 unsigned Size = Op->getConstantOperandVal(4);
9844
9845 switch (Size) {
9846 default:
9847 return SDValue();
9848 case 1:
9849 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9850 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9851 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9852 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9853 break;
9854 case 2:
9855 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9856 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9857 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9858 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9859 break;
9860 case 4:
9861 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9862 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9863 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9864 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9865 break;
9866 case 12:
9867 if (!Subtarget->hasLDSLoadB96_B128())
9868 return SDValue();
9869 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9870 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9871 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9872 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9873 break;
9874 case 16:
9875 if (!Subtarget->hasLDSLoadB96_B128())
9876 return SDValue();
9877 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9878 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9879 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9880 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9881 break;
9882 }
9883
9884 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9885
9886     SmallVector<SDValue, 8> Ops;
9887
9888 if (HasVIndex && HasVOffset)
9889 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9890 {Op.getOperand(5), // VIndex
9891 VOffset}));
9892 else if (HasVIndex)
9893 Ops.push_back(Op.getOperand(5));
9894 else if (HasVOffset)
9895 Ops.push_back(VOffset);
9896
9897 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9898 Ops.push_back(Rsrc);
9899 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9900 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9901 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9902 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9903     Ops.push_back(DAG.getTargetConstant(
9904         Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9905 DL, MVT::i8)); // cpol
9906     Ops.push_back(DAG.getTargetConstant(
9907         Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9908 ? 1
9909 : 0,
9910 DL, MVT::i8)); // swz
9911 Ops.push_back(M0Val.getValue(0)); // Chain
9912 Ops.push_back(M0Val.getValue(1)); // Glue
9913
9914 auto *M = cast<MemSDNode>(Op);
9915 MachineMemOperand *LoadMMO = M->getMemOperand();
9916 // Don't set the offset value here because the pointer points to the base of
9917 // the buffer.
9918 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9919
9920 MachinePointerInfo StorePtrI = LoadPtrI;
9921 LoadPtrI.V = PoisonValue::get(
9922         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9923     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9924     StorePtrI.AddrSpace = AMDGPUAS::LDS_ADDRESS;
9925
9926     auto F = LoadMMO->getFlags() &
9927              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9928     LoadMMO =
9929         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9930 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9931
9932     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9933         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9934 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9935
9936 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9937 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9938
9939 return SDValue(Load, 0);
9940 }
9941 case Intrinsic::amdgcn_global_load_lds: {
9942 unsigned Opc;
9943 unsigned Size = Op->getConstantOperandVal(4);
9944 switch (Size) {
9945 default:
9946 return SDValue();
9947 case 1:
9948 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9949 break;
9950 case 2:
9951 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9952 break;
9953 case 4:
9954 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9955 break;
9956 case 12:
9957 if (!Subtarget->hasLDSLoadB96_B128())
9958 return SDValue();
9959 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9960 break;
9961 case 16:
9962 if (!Subtarget->hasLDSLoadB96_B128())
9963 return SDValue();
9964 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
9965 break;
9966 }
9967
9968 auto *M = cast<MemSDNode>(Op);
9969 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9970
9971     SmallVector<SDValue, 6> Ops;
9972
9973 SDValue Addr = Op.getOperand(2); // Global ptr
9974 SDValue VOffset;
9975 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9976 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
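    // For example, (add (i64 uniform base), (zext (i32 divergent index)))
    // becomes saddr = base with the 32-bit index as the voffset operand.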
9977 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9978 SDValue LHS = Addr.getOperand(0);
9979 SDValue RHS = Addr.getOperand(1);
9980
9981 if (LHS->isDivergent())
9982 std::swap(LHS, RHS);
9983
9984 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9985 RHS.getOperand(0).getValueType() == MVT::i32) {
9986 // add (i64 sgpr), (zero_extend (i32 vgpr))
9987 Addr = LHS;
9988 VOffset = RHS.getOperand(0);
9989 }
9990 }
9991
9992 Ops.push_back(Addr);
9993 if (!Addr->isDivergent()) {
9994 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9995 if (!VOffset)
9996 VOffset =
9997 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9998 DAG.getTargetConstant(0, DL, MVT::i32)),
9999 0);
10000 Ops.push_back(VOffset);
10001 }
10002
10003 Ops.push_back(Op.getOperand(5)); // Offset
10004 Ops.push_back(Op.getOperand(6)); // CPol
10005 Ops.push_back(M0Val.getValue(0)); // Chain
10006 Ops.push_back(M0Val.getValue(1)); // Glue
10007
10008 MachineMemOperand *LoadMMO = M->getMemOperand();
10009 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10010 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10011 MachinePointerInfo StorePtrI = LoadPtrI;
10012     LoadPtrI.V = PoisonValue::get(
10013         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10014     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10015     StorePtrI.AddrSpace = AMDGPUAS::LDS_ADDRESS;
10016     auto F = LoadMMO->getFlags() &
10017              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10018     LoadMMO =
10019         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10020 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10021     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10022         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10023 LoadMMO->getAAInfo());
10024
10025 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10026 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10027
10028 return SDValue(Load, 0);
10029 }
10030 case Intrinsic::amdgcn_end_cf:
10031 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10032 Op->getOperand(2), Chain),
10033 0);
10034 case Intrinsic::amdgcn_s_barrier_init:
10035 case Intrinsic::amdgcn_s_barrier_signal_var: {
10036 // these two intrinsics have two operands: barrier pointer and member count
10037     SDValue Chain = Op->getOperand(0);
10038     SmallVector<SDValue, 2> Ops;
10039 SDValue BarOp = Op->getOperand(2);
10040 SDValue CntOp = Op->getOperand(3);
10041 SDValue M0Val;
10042 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10043 ? AMDGPU::S_BARRIER_INIT_M0
10044 : AMDGPU::S_BARRIER_SIGNAL_M0;
10045 // extract the BarrierID from bits 4-9 of BarOp
10046 SDValue BarID;
10047 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10048 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10049 BarID =
10050 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10051 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10052 0);
10053 // Member count should be put into M0[ShAmt:+6]
10054 // Barrier ID should be put into M0[5:0]
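    // e.g. a member count of 8 joining barrier ID 3 yields
    // M0 = (8 << 16) | 3 = 0x80003.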
10055 M0Val =
10056 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10057 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10058 0);
10059 constexpr unsigned ShAmt = 16;
10060 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10061 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10062
10063 M0Val = SDValue(
10064 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10065
10066 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10067
10068 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10069 return SDValue(NewMI, 0);
10070 }
10071 case Intrinsic::amdgcn_s_barrier_join:
10072 case Intrinsic::amdgcn_s_wakeup_barrier: {
10073     // These two intrinsics have one operand: the barrier pointer.
10074     SDValue Chain = Op->getOperand(0);
10075     SmallVector<SDValue, 2> Ops;
10076 SDValue BarOp = Op->getOperand(2);
10077 unsigned Opc;
10078
10079 if (isa<ConstantSDNode>(BarOp)) {
10080 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10081 switch (IntrinsicID) {
10082 default:
10083 return SDValue();
10084 case Intrinsic::amdgcn_s_barrier_join:
10085 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10086 break;
10087 case Intrinsic::amdgcn_s_wakeup_barrier:
10088 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
10089 break;
10090 }
10091 // extract the BarrierID from bits 4-9 of the immediate
10092 unsigned BarID = (BarVal >> 4) & 0x3F;
10093 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10094 Ops.push_back(K);
10095 Ops.push_back(Chain);
10096 } else {
10097 switch (IntrinsicID) {
10098 default:
10099 return SDValue();
10100 case Intrinsic::amdgcn_s_barrier_join:
10101 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10102 break;
10103 case Intrinsic::amdgcn_s_wakeup_barrier:
10104 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
10105 break;
10106 }
10107 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10108 SDValue M0Val;
10109 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10110 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10111 M0Val =
10112 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10113 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10114 0);
10115 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10116 }
10117
10118 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10119 return SDValue(NewMI, 0);
10120 }
10121 case Intrinsic::amdgcn_s_prefetch_data: {
10122 // For non-global address space preserve the chain and remove the call.
10123 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10124 return Op.getOperand(0);
10125 return Op;
10126 }
10127 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10128 SDValue Ops[] = {
10129 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10130 Op.getOperand(3), // offset
10131 Op.getOperand(4), // length
10132 };
10133
10134 MemSDNode *M = cast<MemSDNode>(Op);
10135     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10136                                    Op->getVTList(), Ops, M->getMemoryVT(),
10137 M->getMemOperand());
10138 }
10139 default: {
10140     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10141             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10141 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10142 return lowerImage(Op, ImageDimIntr, DAG, true);
10143
10144 return Op;
10145 }
10146 }
10147}
10148
10149// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10150// offset (the offset that is included in bounds checking and swizzling, to be
10151// split between the instruction's voffset and immoffset fields) and soffset
10152// (the offset that is excluded from bounds checking and swizzling, to go in
10153// the instruction's soffset field). This function takes the first kind of
10154// offset and figures out how to split it between voffset and immoffset.
10155std::pair<SDValue, SDValue>
10156SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10157 SDLoc DL(Offset);
10158 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10159 SDValue N0 = Offset;
10160 ConstantSDNode *C1 = nullptr;
10161
10162 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10163 N0 = SDValue();
10164 else if (DAG.isBaseWithConstantOffset(N0)) {
10165 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10166 N0 = N0.getOperand(0);
10167 }
10168
10169 if (C1) {
10170 unsigned ImmOffset = C1->getZExtValue();
10171 // If the immediate value is too big for the immoffset field, put only bits
10172 // that would normally fit in the immoffset field. The remaining value that
10173 // is copied/added for the voffset field is a large power of 2, and it
10174 // stands more chance of being CSEd with the copy/add for another similar
10175 // load/store.
10176 // However, do not do that rounding down if the remaining value would be
10177 // negative, as it appears to be illegal to have a negative offset in the
10178 // vgpr, even if adding the immediate offset makes it positive.
10179 unsigned Overflow = ImmOffset & ~MaxImm;
10180 ImmOffset -= Overflow;
10181 if ((int32_t)Overflow < 0) {
10182 Overflow += ImmOffset;
10183 ImmOffset = 0;
10184 }
10185 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10186 if (Overflow) {
10187 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10188 if (!N0)
10189 N0 = OverflowVal;
10190 else {
10191 SDValue Ops[] = {N0, OverflowVal};
10192 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10193 }
10194 }
10195 }
10196 if (!N0)
10197 N0 = DAG.getConstant(0, DL, MVT::i32);
10198 if (!C1)
10199 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10200 return {N0, SDValue(C1, 0)};
10201}
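// A small worked example of the split above (an illustrative sketch, assuming
// MaxImm == 4095, the usual pre-GFX12 MUBUF immediate limit; the helper below
// is hypothetical and only mirrors the arithmetic, it is not part of the
// lowering):
//
//   std::pair<uint32_t, uint32_t> splitOffsetSketch(uint32_t Off) {
//     uint32_t Overflow = Off & ~4095u;  // part that must go in voffset
//     uint32_t Imm = Off - Overflow;     // part that fits in immoffset
//     if ((int32_t)Overflow < 0) { Overflow += Imm; Imm = 0; }
//     return {Overflow, Imm};
//   }
//
//   splitOffsetSketch(4100) == {4096, 4}   // voffset gets the CSE-friendly
//                                          // power of two, immoffset gets 4
//   splitOffsetSketch(20)   == {0, 20}     // fits entirely in the immediate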
10202
10203// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10204// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10205// pointed to by Offsets.
10206void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10207 SelectionDAG &DAG, SDValue *Offsets,
10208 Align Alignment) const {
10209 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10210 SDLoc DL(CombinedOffset);
10211 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10212 uint32_t Imm = C->getZExtValue();
10213 uint32_t SOffset, ImmOffset;
10214 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10215 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10216 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10217 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10218 return;
10219 }
10220 }
10221 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10222 SDValue N0 = CombinedOffset.getOperand(0);
10223 SDValue N1 = CombinedOffset.getOperand(1);
10224 uint32_t SOffset, ImmOffset;
10225 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10226 if (Offset >= 0 &&
10227 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10228 Offsets[0] = N0;
10229 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10230 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10231 return;
10232 }
10233 }
10234
10235 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10236 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10237 : DAG.getConstant(0, DL, MVT::i32);
10238
10239 Offsets[0] = CombinedOffset;
10240 Offsets[1] = SOffsetZero;
10241 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10242}
10243
10244SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10245 SelectionDAG &DAG) const {
10246 if (!MaybePointer.getValueType().isScalarInteger())
10247 return MaybePointer;
10248
10249 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10250 return Rsrc;
10251}
10252
10253// Wrap a global or flat pointer into a buffer intrinsic using the flags
10254// specified in the intrinsic.
10255SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10256 SelectionDAG &DAG) const {
10257 SDLoc Loc(Op);
10258
10259 SDValue Pointer = Op->getOperand(1);
10260 SDValue Stride = Op->getOperand(2);
10261 SDValue NumRecords = Op->getOperand(3);
10262 SDValue Flags = Op->getOperand(4);
10263
10264 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10265 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10266 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10267 std::optional<uint32_t> ConstStride = std::nullopt;
10268 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10269 ConstStride = ConstNode->getZExtValue();
10270
10271 SDValue NewHighHalf = Masked;
10272 if (!ConstStride || *ConstStride != 0) {
10273 SDValue ShiftedStride;
10274 if (ConstStride) {
10275 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10276 } else {
10277 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10278 ShiftedStride =
10279 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10280 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10281 }
10282 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10283 }
10284
10285 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10286 NewHighHalf, NumRecords, Flags);
10287 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10288 return RsrcPtr;
10289}
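// Sketch of the descriptor words produced above (only the fields this function
// writes; any further bit layout is target-defined):
//
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] | (stride[15:0] << 16)  // stride OR is skipped when
//                                                  // the stride is known zero
//   word2 = NumRecords
//   word3 = Flags
//
// The v4i32 is finally bitcast to an i128 so it can flow as a scalar value.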
10290
10291// Handle 8 bit and 16 bit buffer loads
10292SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10293 EVT LoadVT, SDLoc DL,
10294 ArrayRef<SDValue> Ops,
10295 MachineMemOperand *MMO,
10296 bool IsTFE) const {
10297 EVT IntVT = LoadVT.changeTypeToInteger();
10298
10299 if (IsTFE) {
10300 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10301 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10302 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10303 MachineFunction &MF = DAG.getMachineFunction();
10304 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10305 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10306 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10307 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10308 DAG.getConstant(1, DL, MVT::i32));
10309 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10310 DAG.getConstant(0, DL, MVT::i32));
10311 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10312 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10313 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10314 }
10315
10316 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10317 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10318 : AMDGPUISD::BUFFER_LOAD_USHORT;
10319
10320 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10321 SDValue BufferLoad =
10322 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10323 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10324 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10325
10326 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10327}
10328
10329// Handle 8 bit and 16 bit buffer stores
10330SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10331 EVT VDataType, SDLoc DL,
10332 SDValue Ops[],
10333 MemSDNode *M) const {
10334 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10335 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10336
10337 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10338 Ops[1] = BufferStoreExt;
10339 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10340 : AMDGPUISD::BUFFER_STORE_SHORT;
10341 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10342 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10343 M->getMemOperand());
10344}
10345
10346 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10347 SDValue Op, const SDLoc &SL, EVT VT) {
10348 if (VT.bitsLT(Op.getValueType()))
10349 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10350
10351 switch (ExtType) {
10352 case ISD::SEXTLOAD:
10353 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10354 case ISD::ZEXTLOAD:
10355 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10356 case ISD::EXTLOAD:
10357 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10358 case ISD::NON_EXTLOAD:
10359 return Op;
10360 }
10361
10362 llvm_unreachable("invalid ext type");
10363}
10364
10365 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10366// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10367SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10368 DAGCombinerInfo &DCI) const {
10369 SelectionDAG &DAG = DCI.DAG;
10370 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10371 return SDValue();
10372
10373 // FIXME: Constant loads should all be marked invariant.
10374 unsigned AS = Ld->getAddressSpace();
10375 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10376 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10377 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10378 return SDValue();
10379
10380 // Don't do this early, since it may interfere with adjacent load merging for
10381 // illegal types. We can avoid losing alignment information for exotic types
10382 // pre-legalize.
10383 EVT MemVT = Ld->getMemoryVT();
10384 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10385 MemVT.getSizeInBits() >= 32)
10386 return SDValue();
10387
10388 SDLoc SL(Ld);
10389
10390 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10391 "unexpected vector extload");
10392
10393 // TODO: Drop only high part of range.
10394 SDValue Ptr = Ld->getBasePtr();
10395 SDValue NewLoad = DAG.getLoad(
10396 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10397 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10398 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10399 nullptr); // Drop ranges
10400
10401 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10402 if (MemVT.isFloatingPoint()) {
10404 "unexpected fp extload");
10405 TruncVT = MemVT.changeTypeToInteger();
10406 }
10407
10408 SDValue Cvt = NewLoad;
10409 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10410 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10411 DAG.getValueType(TruncVT));
10412 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10413 Ld->getExtensionType() == ISD::EXTLOAD) {
10414 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10415 } else {
10416 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
10417 }
10418
10419 EVT VT = Ld->getValueType(0);
10420 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10421
10422 DCI.AddToWorklist(Cvt.getNode());
10423
10424 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10425 // the appropriate extension from the 32-bit load.
10426 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10427 DCI.AddToWorklist(Cvt.getNode());
10428
10429 // Handle conversion back to floating point if necessary.
10430 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10431
10432 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10433}
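// Illustrative before/after of the widening above (a sketch, not literal DAG
// syntax):
//
//   (i16 (load<constant, align 4> p))
//     -->  (i16 (trunc (i32 (load<constant, align 4> p))))
//
//   (i32 (zextload<constant, align 4, i8> p))
//     -->  conceptually (i32 (and (i32 (load<constant, align 4> p)), 0xff))
//
// The 32-bit load is now SMEM-selectable, and the truncate / extend-in-reg
// reproduces the originally requested value.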
10434
10435 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10436 const SIMachineFunctionInfo &Info) {
10437 // TODO: Should check if the address can definitely not access stack.
10438 if (Info.isEntryFunction())
10439 return Info.getUserSGPRInfo().hasFlatScratchInit();
10440 return true;
10441}
10442
10443SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10444 SDLoc DL(Op);
10445 LoadSDNode *Load = cast<LoadSDNode>(Op);
10446 ISD::LoadExtType ExtType = Load->getExtensionType();
10447 EVT MemVT = Load->getMemoryVT();
10448 MachineMemOperand *MMO = Load->getMemOperand();
10449
10450 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10451 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10452 return SDValue();
10453
10454 // FIXME: Copied from PPC
10455 // First, load into 32 bits, then truncate to 1 bit.
10456
10457 SDValue Chain = Load->getChain();
10458 SDValue BasePtr = Load->getBasePtr();
10459
10460 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10461
10462 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10463 RealMemVT, MMO);
10464
10465 if (!MemVT.isVector()) {
10466 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10467 NewLD.getValue(1)};
10468
10469 return DAG.getMergeValues(Ops, DL);
10470 }
10471
10472 SmallVector<SDValue, 4> Elts;
10473 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10474 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10475 DAG.getConstant(I, DL, MVT::i32));
10476
10477 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10478 }
10479
10480 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10481
10482 return DAG.getMergeValues(Ops, DL);
10483 }
10484
10485 if (!MemVT.isVector())
10486 return SDValue();
10487
10488 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10489 "Custom lowering for non-i32 vectors hasn't been implemented.");
10490
10491 Align Alignment = Load->getAlign();
10492 unsigned AS = Load->getAddressSpace();
10493 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10494 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10495 return SplitVectorLoad(Op, DAG);
10496 }
10497
10498 MachineFunction &MF = DAG.getMachineFunction();
10499 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10500 // If there is a possibility that a flat instruction accesses scratch memory,
10501 // then we need to use the same legalization rules we use for private.
10502 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10503 !Subtarget->hasMultiDwordFlatScratchAddressing())
10504 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10505 ? AMDGPUAS::PRIVATE_ADDRESS
10506 : AMDGPUAS::GLOBAL_ADDRESS;
10507
10508 unsigned NumElements = MemVT.getVectorNumElements();
10509
10510 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10511 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10512 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10513 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10514 isMemOpHasNoClobberedMemOperand(Load))) {
10515 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10516 Alignment >= Align(4) && NumElements < 32) {
10517 if (MemVT.isPow2VectorType() ||
10518 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10519 return SDValue();
10520 return WidenOrSplitVectorLoad(Op, DAG);
10521 }
10522 // Non-uniform loads will be selected to MUBUF instructions, so they
10523 // have the same legalization requirements as global and private
10524 // loads.
10525 //
10526 }
10527 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10528 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10529 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10530 if (NumElements > 4)
10531 return SplitVectorLoad(Op, DAG);
10532 // v3 loads not supported on SI.
10533 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10534 return WidenOrSplitVectorLoad(Op, DAG);
10535
10536 // v3 and v4 loads are supported for private and global memory.
10537 return SDValue();
10538 }
10539 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10540 // Depending on the setting of the private_element_size field in the
10541 // resource descriptor, we can only make private accesses up to a certain
10542 // size.
10543 switch (Subtarget->getMaxPrivateElementSize()) {
10544 case 4: {
10545 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10546 return DAG.getMergeValues({Op0, Op1}, DL);
10547 }
10548 case 8:
10549 if (NumElements > 2)
10550 return SplitVectorLoad(Op, DAG);
10551 return SDValue();
10552 case 16:
10553 // Same as global/flat
10554 if (NumElements > 4)
10555 return SplitVectorLoad(Op, DAG);
10556 // v3 loads not supported on SI.
10557 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10558 return WidenOrSplitVectorLoad(Op, DAG);
10559
10560 return SDValue();
10561 default:
10562 llvm_unreachable("unsupported private_element_size");
10563 }
10564 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10565 unsigned Fast = 0;
10566 auto Flags = Load->getMemOperand()->getFlags();
10567 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10568 Load->getAlign(), Flags, &Fast) &&
10569 Fast > 1)
10570 return SDValue();
10571
10572 if (MemVT.isVector())
10573 return SplitVectorLoad(Op, DAG);
10574 }
10575
10576 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10577 MemVT, *Load->getMemOperand())) {
10578 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10579 return DAG.getMergeValues({Op0, Op1}, DL);
10580 }
10581
10582 return SDValue();
10583}
10584
10585SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10586 EVT VT = Op.getValueType();
10587 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10588 VT.getSizeInBits() == 512)
10589 return splitTernaryVectorOp(Op, DAG);
10590
10591 assert(VT.getSizeInBits() == 64);
10592
10593 SDLoc DL(Op);
10594 SDValue Cond = Op.getOperand(0);
10595
10596 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10597 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10598
10599 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10600 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10601
10602 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10603 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10604
10605 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10606
10607 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10608 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10609
10610 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10611
10612 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10613 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10614}
10615
10616// Catch division cases where we can use shortcuts with rcp and rsq
10617// instructions.
10618SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10619 SelectionDAG &DAG) const {
10620 SDLoc SL(Op);
10621 SDValue LHS = Op.getOperand(0);
10622 SDValue RHS = Op.getOperand(1);
10623 EVT VT = Op.getValueType();
10624 const SDNodeFlags Flags = Op->getFlags();
10625
10626 bool AllowInaccurateRcp =
10627 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10628
10629 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10630 // Without !fpmath accuracy information, we can't do more because we don't
10631 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10632 // f16 is always accurate enough
10633 if (!AllowInaccurateRcp && VT != MVT::f16)
10634 return SDValue();
10635
10636 if (CLHS->isExactlyValue(1.0)) {
10637 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10638 // the CI documentation they have a worst-case error of 1 ulp.
10639 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10640 // use them as long as we aren't trying to use denormals.
10641 //
10642 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
10643
10644 // 1.0 / sqrt(x) -> rsq(x)
10645
10646 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10647 // error seems really high at 2^29 ULP.
10648 // 1.0 / x -> rcp(x)
10649 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10650 }
10651
10652 // Same as for 1.0, but expand the sign out of the constant.
10653 if (CLHS->isExactlyValue(-1.0)) {
10654 // -1.0 / x -> rcp (fneg x)
10655 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10656 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10657 }
10658 }
10659
10660 // For f16 require afn or arcp.
10661 // For f32 require afn.
10662 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10663 return SDValue();
10664
10665 // Turn into multiply by the reciprocal.
10666 // x / y -> x * (1.0 / y)
10667 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10668 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10669}
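// Shortcuts taken above, shown on concrete inputs (valid only under the
// afn/unsafe-fp, f16, or arcp conditions checked in this function):
//
//   fdiv  1.0, x  ->  rcp(x)
//   fdiv -1.0, x  ->  rcp(fneg x)
//   fdiv    a, b  ->  fmul a, rcp(b)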
10670
10671SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10672 SelectionDAG &DAG) const {
10673 SDLoc SL(Op);
10674 SDValue X = Op.getOperand(0);
10675 SDValue Y = Op.getOperand(1);
10676 EVT VT = Op.getValueType();
10677 const SDNodeFlags Flags = Op->getFlags();
10678
10679 bool AllowInaccurateDiv =
10680 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10681 if (!AllowInaccurateDiv)
10682 return SDValue();
10683
10684 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10685 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10686
10687 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10688 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10689
10690 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10691 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10692 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10693 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10694 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10695 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10696}
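// The FMA chain above is two Newton-Raphson refinements of rcp(y) followed by
// one refinement of the quotient, i.e. (a sketch of the math):
//
//   e = fma(-y, r, 1.0)      // e = 1 - y*r
//   r = fma( e, r, r)        // r <- r*(2 - y*r), roughly doubles correct bits
//   ... one more identical step ...
//   q = x * r
//   d = fma(-y, q, x)        // residual d = x - y*q
//   result = fma(d, r, q)    // q <- q + d*r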
10697
10698static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10699 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10700 SDNodeFlags Flags) {
10701 if (GlueChain->getNumValues() <= 1) {
10702 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10703 }
10704
10705 assert(GlueChain->getNumValues() == 3);
10706
10707 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10708 switch (Opcode) {
10709 default:
10710 llvm_unreachable("no chain equivalent for opcode");
10711 case ISD::FMUL:
10712 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10713 break;
10714 }
10715
10716 return DAG.getNode(Opcode, SL, VTList,
10717 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10718 Flags);
10719}
10720
10721static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10722 EVT VT, SDValue A, SDValue B, SDValue C,
10723 SDValue GlueChain, SDNodeFlags Flags) {
10724 if (GlueChain->getNumValues() <= 1) {
10725 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10726 }
10727
10728 assert(GlueChain->getNumValues() == 3);
10729
10730 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10731 switch (Opcode) {
10732 default:
10733 llvm_unreachable("no chain equivalent for opcode");
10734 case ISD::FMA:
10735 Opcode = AMDGPUISD::FMA_W_CHAIN;
10736 break;
10737 }
10738
10739 return DAG.getNode(Opcode, SL, VTList,
10740 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10741 Flags);
10742}
10743
10744SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10745 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10746 return FastLowered;
10747
10748 SDLoc SL(Op);
10749 SDValue LHS = Op.getOperand(0);
10750 SDValue RHS = Op.getOperand(1);
10751
10752 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10753 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10754 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10755 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10756 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10757 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10758 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10759 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10760 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10761 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10762 // q16.u = opx(V_CVT_F16_F32, q32.u);
10763 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10764
10765 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10766 unsigned FMADOpCode =
10767 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10768
10769 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10770 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10771 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10772 SDValue Rcp =
10773 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10774 SDValue Quot =
10775 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10776 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10777 Op->getFlags());
10778 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10779 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10780 Op->getFlags());
10781 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10782 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10783 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10784 DAG.getConstant(0xff800000, SL, MVT::i32));
10785 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10786 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10787 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10788 DAG.getTargetConstant(0, SL, MVT::i32));
10789 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10790 Op->getFlags());
10791}
10792
10793// Faster 2.5 ULP division that does not support denormals.
10794SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10795 SDNodeFlags Flags = Op->getFlags();
10796 SDLoc SL(Op);
10797 SDValue LHS = Op.getOperand(1);
10798 SDValue RHS = Op.getOperand(2);
10799
10800 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10801
10802 const APFloat K0Val(0x1p+96f);
10803 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10804
10805 const APFloat K1Val(0x1p-32f);
10806 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10807
10808 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10809
10810 EVT SetCCVT =
10811 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10812
10813 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10814
10815 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10816
10817 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10818
10819 // rcp does not support denormals.
10820 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10821
10822 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10823
10824 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10825}
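// The select/scale dance above keeps the reciprocal out of the unsupported
// denormal range: when |rhs| > 2^96 the denominator is pre-scaled by 2^-32 and
// the same factor is re-applied to the product, so mathematically (a sketch):
//
//   lhs / rhs == 2^-32 * (lhs * rcp(rhs * 2^-32))   if |rhs| >  2^96
//   lhs / rhs ==  1.0  * (lhs * rcp(rhs *  1.0 ))   otherwise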
10826
10827// Returns immediate value for setting the F32 denorm mode when using the
10828// S_DENORM_MODE instruction.
10829 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10830 const SIMachineFunctionInfo *Info,
10831 const GCNSubtarget *ST) {
10832 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10833 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10834 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10835 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10836}
10837
10838SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10839 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10840 return FastLowered;
10841
10842 // The selection matcher assumes anything with a chain selects to a
10843 // mayRaiseFPException machine instruction. Since we're introducing a chain
10844 // here, we need to explicitly report nofpexcept for the regular fdiv
10845 // lowering.
10846 SDNodeFlags Flags = Op->getFlags();
10847 Flags.setNoFPExcept(true);
10848
10849 SDLoc SL(Op);
10850 SDValue LHS = Op.getOperand(0);
10851 SDValue RHS = Op.getOperand(1);
10852
10853 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10854
10855 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10856
10857 SDValue DenominatorScaled =
10858 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10859 SDValue NumeratorScaled =
10860 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10861
10862 // Denominator is scaled to not be denormal, so using rcp is ok.
10863 SDValue ApproxRcp =
10864 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10865 SDValue NegDivScale0 =
10866 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10867
10868 using namespace AMDGPU::Hwreg;
10869 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10870 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10871
10872 const MachineFunction &MF = DAG.getMachineFunction();
10873 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10874 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10875
10876 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10877 const bool HasDynamicDenormals =
10878 (DenormMode.Input == DenormalMode::Dynamic) ||
10879 (DenormMode.Output == DenormalMode::Dynamic);
10880
10881 SDValue SavedDenormMode;
10882
10883 if (!PreservesDenormals) {
10884 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10885 // lowering. The chain dependence is insufficient, and we need glue. We do
10886 // not need the glue variants in a strictfp function.
10887
10888 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10889
10890 SDValue Glue = DAG.getEntryNode();
10891 if (HasDynamicDenormals) {
10892 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10893 DAG.getVTList(MVT::i32, MVT::Glue),
10894 {BitField, Glue});
10895 SavedDenormMode = SDValue(GetReg, 0);
10896
10897 Glue = DAG.getMergeValues(
10898 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10899 }
10900
10901 SDNode *EnableDenorm;
10902 if (Subtarget->hasDenormModeInst()) {
10903 const SDValue EnableDenormValue =
10904 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10905
10906 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10907 EnableDenormValue)
10908 .getNode();
10909 } else {
10910 const SDValue EnableDenormValue =
10911 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10912 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10913 {EnableDenormValue, BitField, Glue});
10914 }
10915
10916 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10917 SDValue(EnableDenorm, 1)};
10918
10919 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10920 }
10921
10922 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10923 ApproxRcp, One, NegDivScale0, Flags);
10924
10925 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10926 ApproxRcp, Fma0, Flags);
10927
10928 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10929 Fma1, Flags);
10930
10931 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10932 NumeratorScaled, Mul, Flags);
10933
10934 SDValue Fma3 =
10935 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10936
10937 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10938 NumeratorScaled, Fma3, Flags);
10939
10940 if (!PreservesDenormals) {
10941 SDNode *DisableDenorm;
10942 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10943 const SDValue DisableDenormValue = getSPDenormModeValue(
10944 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10945
10946 DisableDenorm =
10947 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10948 DisableDenormValue, Fma4.getValue(2))
10949 .getNode();
10950 } else {
10951 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10952 const SDValue DisableDenormValue =
10953 HasDynamicDenormals
10954 ? SavedDenormMode
10955 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10956
10957 DisableDenorm = DAG.getMachineNode(
10958 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10959 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10960 }
10961
10962 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10963 SDValue(DisableDenorm, 0), DAG.getRoot());
10964 DAG.setRoot(OutputChain);
10965 }
10966
10967 SDValue Scale = NumeratorScaled.getValue(1);
10968 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10969 {Fma4, Fma1, Fma3, Scale}, Flags);
10970
10971 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10972}
10973
10974SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10975 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10976 return FastLowered;
10977
10978 SDLoc SL(Op);
10979 SDValue X = Op.getOperand(0);
10980 SDValue Y = Op.getOperand(1);
10981
10982 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10983
10984 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10985
10986 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10987
10988 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10989
10990 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10991
10992 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10993
10994 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10995
10996 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10997
10998 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10999
11000 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11001 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11002
11003 SDValue Fma4 =
11004 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11005
11006 SDValue Scale;
11007
11008 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11009 // Work around a hardware bug on SI where the condition output from div_scale
11010 // is not usable.
11011
11012 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11013
11014 // Figure out which scale to use for div_fmas.
11015 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11016 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11017 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11018 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11019
11020 SDValue NumHi =
11021 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11022 SDValue DenHi =
11023 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11024
11025 SDValue Scale0Hi =
11026 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11027 SDValue Scale1Hi =
11028 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11029
11030 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11031 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11032 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11033 } else {
11034 Scale = DivScale1.getValue(1);
11035 }
11036
11037 SDValue Fmas =
11038 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11039
11040 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11041}
11042
11043SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11044 EVT VT = Op.getValueType();
11045
11046 if (VT == MVT::f32)
11047 return LowerFDIV32(Op, DAG);
11048
11049 if (VT == MVT::f64)
11050 return LowerFDIV64(Op, DAG);
11051
11052 if (VT == MVT::f16)
11053 return LowerFDIV16(Op, DAG);
11054
11055 llvm_unreachable("Unexpected type for fdiv");
11056}
11057
11058SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11059 SDLoc dl(Op);
11060 SDValue Val = Op.getOperand(0);
11061 EVT VT = Val.getValueType();
11062 EVT ResultExpVT = Op->getValueType(1);
11063 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11064
11065 SDValue Mant = DAG.getNode(
11066 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11067 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11068
11069 SDValue Exp = DAG.getNode(
11070 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11071 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11072
11073 if (Subtarget->hasFractBug()) {
11074 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11075 SDValue Inf =
11076 DAG.getConstantFP(APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)),
11077 dl, VT);
11078 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11079 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11080 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11081 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11082 }
11083
11084 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11085 return DAG.getMergeValues({Mant, CastExp}, dl);
11086}
11087
11088SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11089 SDLoc DL(Op);
11090 StoreSDNode *Store = cast<StoreSDNode>(Op);
11091 EVT VT = Store->getMemoryVT();
11092
11093 if (VT == MVT::i1) {
11094 return DAG.getTruncStore(
11095 Store->getChain(), DL,
11096 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11097 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11098 }
11099
11100 assert(VT.isVector() &&
11101 Store->getValue().getValueType().getScalarType() == MVT::i32);
11102
11103 unsigned AS = Store->getAddressSpace();
11104 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11105 Store->getAlign().value() < VT.getStoreSize() &&
11106 VT.getSizeInBits() > 32) {
11107 return SplitVectorStore(Op, DAG);
11108 }
11109
11110 MachineFunction &MF = DAG.getMachineFunction();
11111 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11112 // If there is a possibility that a flat instruction accesses scratch memory,
11113 // then we need to use the same legalization rules we use for private.
11114 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11115 !Subtarget->hasMultiDwordFlatScratchAddressing())
11116 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11117 ? AMDGPUAS::PRIVATE_ADDRESS
11118 : AMDGPUAS::GLOBAL_ADDRESS;
11119
11120 unsigned NumElements = VT.getVectorNumElements();
11121 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11122 if (NumElements > 4)
11123 return SplitVectorStore(Op, DAG);
11124 // v3 stores not supported on SI.
11125 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11126 return SplitVectorStore(Op, DAG);
11127
11128 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11129 VT, *Store->getMemOperand()))
11130 return expandUnalignedStore(Store, DAG);
11131
11132 return SDValue();
11133 }
11134 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11135 switch (Subtarget->getMaxPrivateElementSize()) {
11136 case 4:
11137 return scalarizeVectorStore(Store, DAG);
11138 case 8:
11139 if (NumElements > 2)
11140 return SplitVectorStore(Op, DAG);
11141 return SDValue();
11142 case 16:
11143 if (NumElements > 4 ||
11144 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11145 return SplitVectorStore(Op, DAG);
11146 return SDValue();
11147 default:
11148 llvm_unreachable("unsupported private_element_size");
11149 }
11150 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11151 unsigned Fast = 0;
11152 auto Flags = Store->getMemOperand()->getFlags();
11153 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11154 Store->getAlign(), Flags, &Fast) &&
11155 Fast > 1)
11156 return SDValue();
11157
11158 if (VT.isVector())
11159 return SplitVectorStore(Op, DAG);
11160
11161 return expandUnalignedStore(Store, DAG);
11162 }
11163
11164 // Probably an invalid store. If so we'll end up emitting a selection error.
11165 return SDValue();
11166}
11167
11168// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11169SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11170 SDLoc SL(Op);
11171 assert(!Subtarget->has16BitInsts());
11172 SDNodeFlags Flags = Op->getFlags();
11173 SDValue Ext =
11174 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11175
11176 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11177 SDValue Sqrt =
11178 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11179
11180 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11181 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11182}
11183
11184SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11185 SDLoc DL(Op);
11186 SDNodeFlags Flags = Op->getFlags();
11187 MVT VT = Op.getValueType().getSimpleVT();
11188 const SDValue X = Op.getOperand(0);
11189
11190 if (allowApproxFunc(DAG, Flags)) {
11191 // Instruction is 1ulp but ignores denormals.
11192 return DAG.getNode(
11193 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11194 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11195 }
11196
11197 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11198 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11199
11200 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11201
11202 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11203
11204 SDValue SqrtX =
11205 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11206
11207 SDValue SqrtS;
11208 if (needsDenormHandlingF32(DAG, X, Flags)) {
11209 SDValue SqrtID =
11210 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11211 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11212
11213 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11214 SDValue SqrtSNextDownInt =
11215 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11216 DAG.getAllOnesConstant(DL, MVT::i32));
11217 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11218
11219 SDValue NegSqrtSNextDown =
11220 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11221
11222 SDValue SqrtVP =
11223 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11224
11225 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11226 DAG.getConstant(1, DL, MVT::i32));
11227 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11228
11229 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11230 SDValue SqrtVS =
11231 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11232
11233 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11234 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11235
11236 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11237 Flags);
11238
11239 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11240 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11241 Flags);
11242 } else {
11243 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11244
11245 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11246
11247 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11248 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11249 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11250
11251 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11252 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11253 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11254
11255 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11256 SDValue SqrtD =
11257 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11258 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11259 }
11260
11261 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11262
11263 SDValue ScaledDown =
11264 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11265
11266 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11267 SDValue IsZeroOrInf =
11268 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11269 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11270
11271 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11272}
11273
11274SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11275 // For the f64 type, the SQRT and RSQ instructions don't have the required
11276 // precision, so we apply Goldschmidt's algorithm to improve the result:
11277 //
11278 // y0 = rsq(x)
11279 // g0 = x * y0
11280 // h0 = 0.5 * y0
11281 //
11282 // r0 = 0.5 - h0 * g0
11283 // g1 = g0 * r0 + g0
11284 // h1 = h0 * r0 + h0
11285 //
11286 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11287 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11288 // h2 = h1 * r1 + h1
11289 //
11290 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11291 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11292 //
11293 // sqrt(x) = g3
11294
11295 SDNodeFlags Flags = Op->getFlags();
11296
11297 SDLoc DL(Op);
11298
11299 SDValue X = Op.getOperand(0);
11300 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11301
11302 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11303
11304 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11305
11306 // Scale up input if it is too small.
11307 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11308 SDValue ScaleUp =
11309 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11310 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11311
11312 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11313
11314 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11315
11316 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11317 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11318
11319 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11320 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11321
11322 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11323
11324 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11325
11326 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11327 SDValue SqrtD0 =
11328 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11329
11330 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11331
11332 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11333 SDValue SqrtD1 =
11334 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11335
11336 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11337
11338 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11339 SDValue ScaleDown =
11340 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11341 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11342
11343 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11344 // with finite only or nsz because rsq(+/-0) = +/-inf
11345
11346 // TODO: Check for DAZ and expand to subnormals
11347 SDValue IsZeroOrInf =
11348 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11349 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11350
11351 // If x is +INF, +0, or -0, use its original value
11352 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11353 Flags);
11354}
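// Rough invariants of the Goldschmidt iteration implemented above (a sketch):
//
//   SqrtS0, SqrtS1, SqrtS2, SqrtRet  converge to  sqrt(x)
//   SqrtH0, SqrtH1                   converge to  0.5 / sqrt(x)
//
// Each residual d_i = x - g_i*g_i is folded back in with g_{i+1} = d_i*h_i + g_i,
// which cancels the error to first order, roughly doubling the number of
// correct bits per step.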
11355
11356SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11357 SDLoc DL(Op);
11358 EVT VT = Op.getValueType();
11359 SDValue Arg = Op.getOperand(0);
11360 SDValue TrigVal;
11361
11362 // Propagate fast-math flags so that the multiply we introduce can be folded
11363 // if Arg is already the result of a multiply by constant.
11364 auto Flags = Op->getFlags();
11365
11366 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11367
11368 if (Subtarget->hasTrigReducedRange()) {
11369 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11370 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11371 } else {
11372 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11373 }
11374
11375 switch (Op.getOpcode()) {
11376 case ISD::FCOS:
11377 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11378 case ISD::FSIN:
11379 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11380 default:
11381 llvm_unreachable("Wrong trig opcode");
11382 }
11383}
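// Numeric sketch of the trig lowering above: the hardware SIN/COS take their
// argument in "turns", so
//
//   sin(x) ~= SIN_HW(x * (1 / (2*pi)))
//
// and on subtargets with the reduced-range requirement the scaled argument is
// first wrapped into [0, 1) with FRACT before reaching SIN_HW / COS_HW.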
11384
11385SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11386 SelectionDAG &DAG) const {
11387 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11388 assert(AtomicNode->isCompareAndSwap());
11389 unsigned AS = AtomicNode->getAddressSpace();
11390
11391 // No custom lowering required for local address space
11392 if (AS == AMDGPUAS::LOCAL_ADDRESS)
11393 return Op;
11394
11395 // Non-local address space requires custom lowering for atomic compare
11396 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11397 SDLoc DL(Op);
11398 SDValue ChainIn = Op.getOperand(0);
11399 SDValue Addr = Op.getOperand(1);
11400 SDValue Old = Op.getOperand(2);
11401 SDValue New = Op.getOperand(3);
11402 EVT VT = Op.getValueType();
11403 MVT SimpleVT = VT.getSimpleVT();
11404 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11405
11406 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11407 SDValue Ops[] = {ChainIn, Addr, NewOld};
11408
11409 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11410 Op->getVTList(), Ops, VT,
11411 AtomicNode->getMemOperand());
11412}
11413
11414//===----------------------------------------------------------------------===//
11415// Custom DAG optimizations
11416//===----------------------------------------------------------------------===//
11417
11418SDValue
11419SITargetLowering::performUCharToFloatCombine(SDNode *N,
11420 DAGCombinerInfo &DCI) const {
11421 EVT VT = N->getValueType(0);
11422 EVT ScalarVT = VT.getScalarType();
11423 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11424 return SDValue();
11425
11426 SelectionDAG &DAG = DCI.DAG;
11427 SDLoc DL(N);
11428
11429 SDValue Src = N->getOperand(0);
11430 EVT SrcVT = Src.getValueType();
11431
11432 // TODO: We could try to match extracting the higher bytes, which would be
11433 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11434 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11435 // about in practice.
11436 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11437 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11438 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11439 DCI.AddToWorklist(Cvt.getNode());
11440
11441 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11442 if (ScalarVT != MVT::f32) {
11443 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11444 DAG.getTargetConstant(0, DL, MVT::i32));
11445 }
11446 return Cvt;
11447 }
11448 }
11449
11450 return SDValue();
11451}
11452
11453SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11454 DAGCombinerInfo &DCI) const {
11455 SDValue MagnitudeOp = N->getOperand(0);
11456 SDValue SignOp = N->getOperand(1);
11457 SelectionDAG &DAG = DCI.DAG;
11458 SDLoc DL(N);
11459
11460 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11461 // lower half with a copy.
11462 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11463 if (MagnitudeOp.getValueType() == MVT::f64) {
11464 SDValue MagAsVector =
11465 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11466 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11467 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11468 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11469 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11470
11471 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11472
11473 SDValue Vector =
11474 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11475
11476 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11477 }
11478
11479 if (SignOp.getValueType() != MVT::f64)
11480 return SDValue();
11481
11482 // Reduce width of sign operand, we only need the highest bit.
11483 //
11484 // fcopysign f64:x, f64:y ->
11485 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11486 // TODO: In some cases it might make sense to go all the way to f16.
11487 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11488 SDValue SignAsF32 =
11489 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11490 DAG.getConstant(1, DL, MVT::i32));
11491
11492 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11493 SignAsF32);
11494}
11495
11496// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11497// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11498// bits
11499
11500// This is a variant of
11501// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11502//
11503// The normal DAG combiner will do this, but only if the add has one use since
11504// that would increase the number of instructions.
11505//
11506// This prevents us from seeing a constant offset that can be folded into a
11507// memory instruction's addressing mode. If we know the resulting add offset of
11508// a pointer can be folded into an addressing offset, we can replace the pointer
11509// operand with the add of new constant offset. This eliminates one of the uses,
11510// and may allow the remaining use to also be simplified.
11511//
11512SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11513 EVT MemVT,
11514 DAGCombinerInfo &DCI) const {
11515 SDValue N0 = N->getOperand(0);
11516 SDValue N1 = N->getOperand(1);
11517
11518 // We only do this to handle cases where it's profitable when there are
11519 // multiple uses of the add, so defer to the standard combine.
11520 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11521 N0->hasOneUse())
11522 return SDValue();
11523
11524 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11525 if (!CN1)
11526 return SDValue();
11527
11528 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11529 if (!CAdd)
11530 return SDValue();
11531
11532 SelectionDAG &DAG = DCI.DAG;
11533
11534 if (N0->getOpcode() == ISD::OR &&
11535 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11536 return SDValue();
11537
11538 // If the resulting offset is too large, we can't fold it into the
11539 // addressing mode offset.
11540 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11541 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11542
11543 AddrMode AM;
11544 AM.HasBaseReg = true;
11545 AM.BaseOffs = Offset.getSExtValue();
11546 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11547 return SDValue();
11548
11549 SDLoc SL(N);
11550 EVT VT = N->getValueType(0);
11551
11552 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11553 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11554
11555 SDNodeFlags Flags;
11556 Flags.setNoUnsignedWrap(
11557 N->getFlags().hasNoUnsignedWrap() &&
11558 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11559
11560 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11561}
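// Worked example of the combine above, assuming a +256 immediate is legal for
// the addressing mode of the memory access in question:
//
//   (shl (add x, 64), 2)  -->  (add (shl x, 2), 256)
//
// The 256 can then be folded into the load/store's offset field even though
// the original add had other users.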
11562
11563 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11564 /// is offset by the chain and intrinsic ID. Theoretically we would also need to
11565 /// check the specific intrinsic, but they all place the pointer operand first.
11566static unsigned getBasePtrIndex(const MemSDNode *N) {
11567 switch (N->getOpcode()) {
11568 case ISD::STORE:
11569 case ISD::INTRINSIC_W_CHAIN:
11570 case ISD::INTRINSIC_VOID:
11571 return 2;
11572 default:
11573 return 1;
11574 }
11575}
11576
11577SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11578 DAGCombinerInfo &DCI) const {
11579 SelectionDAG &DAG = DCI.DAG;
11580 SDLoc SL(N);
11581
11582 unsigned PtrIdx = getBasePtrIndex(N);
11583 SDValue Ptr = N->getOperand(PtrIdx);
11584
11585 // TODO: We could also do this for multiplies.
11586 if (Ptr.getOpcode() == ISD::SHL) {
11587 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11588 N->getMemoryVT(), DCI);
11589 if (NewPtr) {
11590 SmallVector<SDValue, 8> NewOps(N->ops());
11591
11592 NewOps[PtrIdx] = NewPtr;
11593 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11594 }
11595 }
11596
11597 return SDValue();
11598}
11599
11600static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11601 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11602 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11603 (Opc == ISD::XOR && Val == 0);
11604}
11605
11606 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
11607 // ops. This will typically happen anyway for a VALU 64-bit and. It exposes other
11608 // 32-bit integer combine opportunities since most 64-bit operations are
11609 // decomposed this way. TODO: We won't want this for SALU especially if it is an
11610 // inline immediate.
11611SDValue SITargetLowering::splitBinaryBitConstantOp(
11612 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11613 const ConstantSDNode *CRHS) const {
11614 uint64_t Val = CRHS->getZExtValue();
11615 uint32_t ValLo = Lo_32(Val);
11616 uint32_t ValHi = Hi_32(Val);
11617 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11618
11619 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11620 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11621 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11622 // If we need to materialize a 64-bit immediate, it will be split up later
11623 // anyway. Avoid creating the harder to understand 64-bit immediate
11624 // materialization.
11625 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11626 }
11627
11628 return SDValue();
11629}
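// Example of the split above on a 64-bit AND whose constant is trivial in one
// half (a conceptual sketch of the result):
//
//   (and i64:x, 0x00000000FFFFFFFF)
//     low  32 bits: and(x.lo, 0xFFFFFFFF) -> x.lo   // reducible, folds away
//     high 32 bits: and(x.hi, 0x00000000) -> 0
//
// so no 64-bit mask ever needs to be materialized.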
11630
11631 static bool isBoolSGPR(SDValue V) {
11632 if (V.getValueType() != MVT::i1)
11633 return false;
11634 switch (V.getOpcode()) {
11635 default:
11636 break;
11637 case ISD::SETCC:
11639 return true;
11640 case ISD::AND:
11641 case ISD::OR:
11642 case ISD::XOR:
11643 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11644 }
11645 return false;
11646}
11647
11648// If a constant has all zeroes or all ones within each byte return it.
11649// Otherwise return 0.
11650 static uint32_t getConstantPermuteMask(uint32_t C) {
11651 // 0xff for any zero byte in the mask
11652 uint32_t ZeroByteMask = 0;
11653 if (!(C & 0x000000ff))
11654 ZeroByteMask |= 0x000000ff;
11655 if (!(C & 0x0000ff00))
11656 ZeroByteMask |= 0x0000ff00;
11657 if (!(C & 0x00ff0000))
11658 ZeroByteMask |= 0x00ff0000;
11659 if (!(C & 0xff000000))
11660 ZeroByteMask |= 0xff000000;
11661 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11662 if ((NonZeroByteMask & C) != NonZeroByteMask)
11663 return 0; // Partial bytes selected.
11664 return C;
11665}
11666
11667// Check if a node selects whole bytes from its operand 0 starting at a byte
11668// boundary while masking the rest. Returns the select mask as used by
11669// v_perm_b32, or ~0 if the match does not succeed.
11670// Note byte select encoding:
11671// value 0-3 selects the corresponding source byte;
11672// value 0xc selects zero;
11673// value 0xff selects 0xff.
11674static uint32_t getPermuteMask(SDValue V) {
11675 assert(V.getValueSizeInBits() == 32);
11676
11677 if (V.getNumOperands() != 2)
11678 return ~0;
11679
11680 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11681 if (!N1)
11682 return ~0;
11683
11684 uint32_t C = N1->getZExtValue();
11685
11686 switch (V.getOpcode()) {
11687 default:
11688 break;
11689 case ISD::AND:
11690 if (uint32_t ConstMask = getConstantPermuteMask(C))
11691 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11692 break;
11693
11694 case ISD::OR:
11695 if (uint32_t ConstMask = getConstantPermuteMask(C))
11696 return (0x03020100 & ~ConstMask) | ConstMask;
11697 break;
11698
11699 case ISD::SHL:
11700 if (C % 8)
11701 return ~0;
11702
11703 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11704
11705 case ISD::SRL:
11706 if (C % 8)
11707 return ~0;
11708
11709 return uint32_t(0x0c0c0c0c03020100ull >> C);
11710 }
11711
11712 return ~0;
11713}
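// Worked examples (editorial sketch, not part of the original source):
// getPermuteMask(and x, 0x0000ff00) yields 0x0c0c010c (byte 1 taken from
// source byte 1, all other bytes zero), and getPermuteMask(srl x, 16) yields
// 0x0c0c0302 (bytes 0-1 taken from source bytes 2-3, bytes 2-3 zero).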
11714
11715SDValue SITargetLowering::performAndCombine(SDNode *N,
11716 DAGCombinerInfo &DCI) const {
11717 if (DCI.isBeforeLegalize())
11718 return SDValue();
11719
11720 SelectionDAG &DAG = DCI.DAG;
11721 EVT VT = N->getValueType(0);
11722 SDValue LHS = N->getOperand(0);
11723 SDValue RHS = N->getOperand(1);
11724
11725 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11726 if (VT == MVT::i64 && CRHS) {
11727 if (SDValue Split =
11728 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11729 return Split;
11730 }
11731
11732 if (CRHS && VT == MVT::i32) {
11733 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11734 // nb = number of trailing zeroes in mask
11735 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11736 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
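 // e.g. (editorial sketch): and (srl x, 8), 0xff00 becomes
 // shl (AssertZext (bfe_u32 x, 16, 8), i8), 8.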
11737 uint64_t Mask = CRHS->getZExtValue();
11738 unsigned Bits = llvm::popcount(Mask);
11739 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11740 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11741 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11742 unsigned Shift = CShift->getZExtValue();
11743 unsigned NB = CRHS->getAPIntValue().countr_zero();
11744 unsigned Offset = NB + Shift;
11745 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11746 SDLoc SL(N);
11747 SDValue BFE =
11748 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11749 DAG.getConstant(Offset, SL, MVT::i32),
11750 DAG.getConstant(Bits, SL, MVT::i32));
11751 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11752 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11753 DAG.getValueType(NarrowVT));
11754 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11755 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11756 return Shl;
11757 }
11758 }
11759 }
11760
11761 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11762 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11763 isa<ConstantSDNode>(LHS.getOperand(2))) {
11764 uint32_t Sel = getConstantPermuteMask(Mask);
11765 if (!Sel)
11766 return SDValue();
11767
11768 // Select 0xc for all zero bytes
11769 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11770 SDLoc DL(N);
11771 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11772 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11773 }
11774 }
11775
11776 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11777 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11778 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11779 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11780 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11781
11782 SDValue X = LHS.getOperand(0);
11783 SDValue Y = RHS.getOperand(0);
11784 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11785 !isTypeLegal(X.getValueType()))
11786 return SDValue();
11787
11788 if (LCC == ISD::SETO) {
11789 if (X != LHS.getOperand(1))
11790 return SDValue();
11791
11792 if (RCC == ISD::SETUNE) {
11793 const ConstantFPSDNode *C1 =
11794 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11795 if (!C1 || !C1->isInfinity() || C1->isNegative())
11796 return SDValue();
11797
11798 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11799 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11800 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11801 SIInstrFlags::P_NORMAL;
11802
11803 static_assert(
11804 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11805 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11806 0x3ff) == Mask,
11807 "mask not equal");
11808
11809 SDLoc DL(N);
11810 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11811 DAG.getConstant(Mask, DL, MVT::i32));
11812 }
11813 }
11814 }
11815
11816 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11817 std::swap(LHS, RHS);
11818
11819 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11820 RHS.hasOneUse()) {
11821 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11822 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11823 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11824 // | n_nan)
11825 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11826 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11827 (RHS.getOperand(0) == LHS.getOperand(0) &&
11828 LHS.getOperand(0) == LHS.getOperand(1))) {
11829 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11830 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11831 : Mask->getZExtValue() & OrdMask;
11832
11833 SDLoc DL(N);
11834 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11835 DAG.getConstant(NewMask, DL, MVT::i32));
11836 }
11837 }
11838
11839 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11840 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11841 // and x, (sext cc from i1) => select cc, x, 0
11842 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11843 std::swap(LHS, RHS);
11844 if (isBoolSGPR(RHS.getOperand(0)))
11845 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11846 DAG.getConstant(0, SDLoc(N), MVT::i32));
11847 }
11848
11849 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11850 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11851 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11852 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11853 uint32_t LHSMask = getPermuteMask(LHS);
11854 uint32_t RHSMask = getPermuteMask(RHS);
11855 if (LHSMask != ~0u && RHSMask != ~0u) {
11856 // Canonicalize the expression in an attempt to have fewer unique masks
11857 // and therefore fewer registers used to hold the masks.
11858 if (LHSMask > RHSMask) {
11859 std::swap(LHSMask, RHSMask);
11860 std::swap(LHS, RHS);
11861 }
11862
11863 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11864 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11865 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11866 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11867
11868 // Check if we need to combine values from two sources within a byte.
11869 if (!(LHSUsedLanes & RHSUsedLanes) &&
11870 // If we select the high word and the low word, keep it for SDWA.
11871 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11872 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11873 // Each byte of each mask is either a selector value 0-3, or has higher
11874 // bits set: 0xff for a 0xff byte, or 0x0c for a zero byte. If either
11875 // mask has 0x0c for a byte, the result shall be 0x0c for that byte.
11876 // Otherwise the mask that is not 0xff wins. By and'ing both masks we get
11877 // a correct result, except that such bytes must be corrected to exactly 0x0c.
11878 uint32_t Mask = LHSMask & RHSMask;
11879 for (unsigned I = 0; I < 32; I += 8) {
11880 uint32_t ByteSel = 0xff << I;
11881 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11882 Mask &= (0x0c << I) & 0xffffffff;
11883 }
11884
11885 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11886 // or 0x0c.
11887 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11888 SDLoc DL(N);
11889
11890 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11891 RHS.getOperand(0),
11892 DAG.getConstant(Sel, DL, MVT::i32));
11893 }
11894 }
11895 }
11896
11897 return SDValue();
11898}
11899
11900// A key component of v_perm is a mapping between the byte positions of the src
11901// operands and the byte positions of the dest. To build it we need: 1. the node
11902// that provides byte x of the dest of the OR, and 2. the byte of that node used
11903// to provide byte x. calculateByteProvider finds which node provides a certain
11904// byte of the dest of the OR, and calculateSrcByte takes that node and finds
11905// the ultimate src and byte position. For example, the supported LoadCombine
11906// pattern for vector loads is as follows:
11907// t1
11908// or
11909// / \
11910// t2 t3
11911// zext shl
11912// | | \
11913// t4 t5 16
11914// or anyext
11915// / \ |
11916// t6 t7 t8
11917// srl shl or
11918// / | / \ / \
11919// t9 t10 t11 t12 t13 t14
11920// trunc* 8 trunc* 8 and and
11921// | | / | | \
11922// t15 t16 t17 t18 t19 t20
11923// trunc* 255 srl -256
11924// | / \
11925// t15 t15 16
11926//
11927// *In this example, the truncs are from i32->i16
11928//
11929// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11930// respectively. calculateSrcByte would find (given node) -> ultimate src &
11931// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11932// After finding the mapping, we can combine the tree into vperm t15, t16,
11933// 0x05000407
11934
11935// Find the source and byte position from a node.
11936// \p DestByte is the byte position of the dest of the or that the src
11937// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11938// byte of the dest of the or. \p Depth tracks how many recursive iterations we have
11939// performed.
11940static const std::optional<ByteProvider<SDValue>>
11941calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11942 unsigned Depth = 0) {
11943 // We may need to recursively traverse a series of SRLs
11944 if (Depth >= 6)
11945 return std::nullopt;
11946
11947 if (Op.getValueSizeInBits() < 8)
11948 return std::nullopt;
11949
11950 if (Op.getValueType().isVector())
11951 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11952
11953 switch (Op->getOpcode()) {
11954 case ISD::TRUNCATE: {
11955 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11956 }
11957
11958 case ISD::SIGN_EXTEND:
11959 case ISD::ZERO_EXTEND:
11960 case ISD::SIGN_EXTEND_INREG: {
11961 SDValue NarrowOp = Op->getOperand(0);
11962 auto NarrowVT = NarrowOp.getValueType();
11963 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11964 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11965 NarrowVT = VTSign->getVT();
11966 }
11967 if (!NarrowVT.isByteSized())
11968 return std::nullopt;
11969 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11970
11971 if (SrcIndex >= NarrowByteWidth)
11972 return std::nullopt;
11973 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11974 }
11975
11976 case ISD::SRA:
11977 case ISD::SRL: {
11978 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11979 if (!ShiftOp)
11980 return std::nullopt;
11981
11982 uint64_t BitShift = ShiftOp->getZExtValue();
11983
11984 if (BitShift % 8 != 0)
11985 return std::nullopt;
11986
11987 SrcIndex += BitShift / 8;
11988
11989 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11990 }
11991
11992 default: {
11993 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11994 }
11995 }
11996 llvm_unreachable("fully handled switch");
11997}
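// Illustrative example (editorial sketch, not part of the original source):
// for an ordinary 32-bit value X, calculateSrcByte(srl X, 16, /*DestByte=*/0)
// walks through the shift, adds 16/8 = 2 to SrcIndex, and reports X as the
// ultimate source with byte position 2.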
11998
11999// For a byte position in the result of an Or, traverse the tree and find the
12000// node (and the byte of the node) which ultimately provides this {Or,
12001// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12002// the byte position of the Op that corresponds with the originally requested
12003// byte of the Or. \p Depth tracks how many recursive iterations we have
12004// performed. \p StartingIndex is the originally requested byte of the Or.
12005static const std::optional<ByteProvider<SDValue>>
12006calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12007 unsigned StartingIndex = 0) {
12008 // Finding Src tree of RHS of or typically requires at least 1 additional
12009 // depth
12010 if (Depth > 6)
12011 return std::nullopt;
12012
12013 unsigned BitWidth = Op.getScalarValueSizeInBits();
12014 if (BitWidth % 8 != 0)
12015 return std::nullopt;
12016 if (Index > BitWidth / 8 - 1)
12017 return std::nullopt;
12018
12019 bool IsVec = Op.getValueType().isVector();
12020 switch (Op.getOpcode()) {
12021 case ISD::OR: {
12022 if (IsVec)
12023 return std::nullopt;
12024
12025 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12026 StartingIndex);
12027 if (!RHS)
12028 return std::nullopt;
12029 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12030 StartingIndex);
12031 if (!LHS)
12032 return std::nullopt;
12033 // A well formed Or will have two ByteProviders for each byte, one of which
12034 // is constant zero
12035 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12036 return std::nullopt;
12037 if (!LHS || LHS->isConstantZero())
12038 return RHS;
12039 if (!RHS || RHS->isConstantZero())
12040 return LHS;
12041 return std::nullopt;
12042 }
12043
12044 case ISD::AND: {
12045 if (IsVec)
12046 return std::nullopt;
12047
12048 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12049 if (!BitMaskOp)
12050 return std::nullopt;
12051
12052 uint32_t BitMask = BitMaskOp->getZExtValue();
12053 // Bits we expect for our StartingIndex
12054 uint32_t IndexMask = 0xFF << (Index * 8);
12055
12056 if ((IndexMask & BitMask) != IndexMask) {
12057 // If the result of the and partially provides the byte, then it
12058 // is not well formatted
12059 if (IndexMask & BitMask)
12060 return std::nullopt;
12061 return ByteProvider<SDValue>::getConstantZero();
12062 }
12063
12064 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12065 }
12066
12067 case ISD::FSHR: {
12068 if (IsVec)
12069 return std::nullopt;
12070
12071 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12072 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12073 if (!ShiftOp || Op.getValueType().isVector())
12074 return std::nullopt;
12075
12076 uint64_t BitsProvided = Op.getValueSizeInBits();
12077 if (BitsProvided % 8 != 0)
12078 return std::nullopt;
12079
12080 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12081 if (BitShift % 8)
12082 return std::nullopt;
12083
12084 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12085 uint64_t ByteShift = BitShift / 8;
12086
12087 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12088 uint64_t BytesProvided = BitsProvided / 8;
12089 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12090 NewIndex %= BytesProvided;
12091 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12092 }
12093
12094 case ISD::SRA:
12095 case ISD::SRL: {
12096 if (IsVec)
12097 return std::nullopt;
12098
12099 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12100 if (!ShiftOp)
12101 return std::nullopt;
12102
12103 uint64_t BitShift = ShiftOp->getZExtValue();
12104 if (BitShift % 8)
12105 return std::nullopt;
12106
12107 auto BitsProvided = Op.getScalarValueSizeInBits();
12108 if (BitsProvided % 8 != 0)
12109 return std::nullopt;
12110
12111 uint64_t BytesProvided = BitsProvided / 8;
12112 uint64_t ByteShift = BitShift / 8;
12113 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12114 // If the byte we are trying to provide (as tracked by index) falls in this
12115 // range, then the SRL provides the byte. The byte of interest of the src of
12116 // the SRL is Index + ByteShift
12117 return BytesProvided - ByteShift > Index
12118 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12119 Index + ByteShift)
12120 : ByteProvider<SDValue>::getConstantZero();
12121 }
12122
12123 case ISD::SHL: {
12124 if (IsVec)
12125 return std::nullopt;
12126
12127 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12128 if (!ShiftOp)
12129 return std::nullopt;
12130
12131 uint64_t BitShift = ShiftOp->getZExtValue();
12132 if (BitShift % 8 != 0)
12133 return std::nullopt;
12134 uint64_t ByteShift = BitShift / 8;
12135
12136 // If we are shifting by an amount greater than (or equal to)
12137 // the index we are trying to provide, then it provides 0s. If not,
12138 // then these bytes are not definitively 0s, and the corresponding byte
12139 // of interest is Index - ByteShift of the src
12140 return Index < ByteShift
12141 ? ByteProvider<SDValue>::getConstantZero()
12142 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12143 Depth + 1, StartingIndex);
12144 }
12145 case ISD::ANY_EXTEND:
12146 case ISD::SIGN_EXTEND:
12147 case ISD::ZERO_EXTEND:
12148 case ISD::SIGN_EXTEND_INREG:
12149 case ISD::AssertZext:
12150 case ISD::AssertSext: {
12151 if (IsVec)
12152 return std::nullopt;
12153
12154 SDValue NarrowOp = Op->getOperand(0);
12155 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12156 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12157 Op->getOpcode() == ISD::AssertZext ||
12158 Op->getOpcode() == ISD::AssertSext) {
12159 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12160 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12161 }
12162 if (NarrowBitWidth % 8 != 0)
12163 return std::nullopt;
12164 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12165
12166 if (Index >= NarrowByteWidth)
12167 return Op.getOpcode() == ISD::ZERO_EXTEND
12168 ? std::optional<ByteProvider<SDValue>>(
12169 ByteProvider<SDValue>::getConstantZero())
12170 : std::nullopt;
12171 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12172 }
12173
12174 case ISD::TRUNCATE: {
12175 if (IsVec)
12176 return std::nullopt;
12177
12178 uint64_t NarrowByteWidth = BitWidth / 8;
12179
12180 if (NarrowByteWidth >= Index) {
12181 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12182 StartingIndex);
12183 }
12184
12185 return std::nullopt;
12186 }
12187
12188 case ISD::CopyFromReg: {
12189 if (BitWidth / 8 > Index)
12190 return calculateSrcByte(Op, StartingIndex, Index);
12191
12192 return std::nullopt;
12193 }
12194
12195 case ISD::LOAD: {
12196 auto *L = cast<LoadSDNode>(Op.getNode());
12197
12198 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12199 if (NarrowBitWidth % 8 != 0)
12200 return std::nullopt;
12201 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12202
12203 // If the width of the load does not reach the byte we are trying to provide
12204 // for, and it is not a ZEXTLOAD, then the load does not provide the byte in
12205 // question.
12206 if (Index >= NarrowByteWidth) {
12207 return L->getExtensionType() == ISD::ZEXTLOAD
12208 ? std::optional<ByteProvider<SDValue>>(
12209 ByteProvider<SDValue>::getConstantZero())
12210 : std::nullopt;
12211 }
12212
12213 if (NarrowByteWidth > Index) {
12214 return calculateSrcByte(Op, StartingIndex, Index);
12215 }
12216
12217 return std::nullopt;
12218 }
12219
12220 case ISD::BSWAP: {
12221 if (IsVec)
12222 return std::nullopt;
12223
12224 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12225 Depth + 1, StartingIndex);
12226 }
12227
12228 case ISD::EXTRACT_VECTOR_ELT: {
12229 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12230 if (!IdxOp)
12231 return std::nullopt;
12232 auto VecIdx = IdxOp->getZExtValue();
12233 auto ScalarSize = Op.getScalarValueSizeInBits();
12234 if (ScalarSize < 32)
12235 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12236 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12237 StartingIndex, Index);
12238 }
12239
12240 case AMDGPUISD::PERM: {
12241 if (IsVec)
12242 return std::nullopt;
12243
12244 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12245 if (!PermMask)
12246 return std::nullopt;
12247
12248 auto IdxMask =
12249 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12250 if (IdxMask > 0x07 && IdxMask != 0x0c)
12251 return std::nullopt;
12252
12253 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12254 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12255
12256 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12257 : ByteProvider<SDValue>(
12258 ByteProvider<SDValue>::getConstantZero());
12259 }
12260
12261 default: {
12262 return std::nullopt;
12263 }
12264 }
12265
12266 llvm_unreachable("fully handled switch");
12267}
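// Illustrative example (editorial sketch, not part of the original source):
// for (or (zext (load i16 %p)), (shl %y, 16)), bytes 0-1 of the result are
// provided by bytes 0-1 of the load (the shl contributes constant zero there),
// while bytes 2-3 are provided by bytes 0-1 of %y (the zext contributes
// constant zero there).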
12268
12269// Returns true if the Operand is a scalar that is, or is extended from, 16 bits
12270static bool isExtendedFrom16Bits(SDValue &Operand) {
12271
12272 switch (Operand.getOpcode()) {
12273 case ISD::ANY_EXTEND:
12274 case ISD::SIGN_EXTEND:
12275 case ISD::ZERO_EXTEND: {
12276 auto OpVT = Operand.getOperand(0).getValueType();
12277 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12278 }
12279 case ISD::LOAD: {
12280 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12281 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12282 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12283 ExtType == ISD::EXTLOAD) {
12284 auto MemVT = L->getMemoryVT();
12285 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12286 }
12287 return L->getMemoryVT().getSizeInBits() == 16;
12288 }
12289 default:
12290 return false;
12291 }
12292}
12293
12294// Returns true if the mask selects consecutive bytes and the first byte
12295// begins at an even (16-bit aligned) offset from the 0th byte
12296static bool addresses16Bits(int Mask) {
12297 int Low8 = Mask & 0xff;
12298 int Hi8 = (Mask & 0xff00) >> 8;
12299
12300 assert(Low8 < 8 && Hi8 < 8);
12301 // Are the bytes contiguous in the order of increasing addresses.
12302 bool IsConsecutive = (Hi8 - Low8 == 1);
12303 // Is the first byte at location that is aligned for 16 bit instructions.
12304 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12305 // In this case, we still need code to extract the 16 bit operand, so it
12306 // is better to use i8 v_perm
12307 bool Is16Aligned = !(Low8 % 2);
12308
12309 return IsConsecutive && Is16Aligned;
12310}
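// Illustrative example (editorial sketch, not part of the original source):
// mask 0x0504 selects bytes 4 and 5, which are consecutive and start at an
// even offset, so it addresses a 16-bit operand; mask 0x0605 selects
// consecutive bytes 5 and 6 but starts at an odd offset, so it does not.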
12311
12312// Do not lower into v_perm if the operands are actually 16 bit
12313// and the selected bits (based on PermMask) correspond with two
12314// easily addressable 16 bit operands.
12315static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12316 SDValue &OtherOp) {
12317 int Low16 = PermMask & 0xffff;
12318 int Hi16 = (PermMask & 0xffff0000) >> 16;
12319
12320 auto TempOp = peekThroughBitcasts(Op);
12321 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12322
12323 auto OpIs16Bit =
12324 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12325 if (!OpIs16Bit)
12326 return true;
12327
12328 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12329 isExtendedFrom16Bits(TempOtherOp);
12330 if (!OtherOpIs16Bit)
12331 return true;
12332
12333 // Do we cleanly address both
12334 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12335}
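// Illustrative example (editorial sketch, not part of the original source):
// with PermMask 0x05040100 and two operands that are extended from 16 bits,
// both halves of the mask cleanly address 16-bit chunks, so this returns
// false and the caller skips the v_perm lowering.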
12336
12337static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12338 unsigned DWordOffset) {
12339 SDValue Ret;
12340
12341 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12342 // ByteProvider must be at least 8 bits
12343 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12344
12345 if (TypeSize <= 32)
12346 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12347
12348 if (Src.getValueType().isVector()) {
12349 auto ScalarTySize = Src.getScalarValueSizeInBits();
12350 auto ScalarTy = Src.getValueType().getScalarType();
12351 if (ScalarTySize == 32) {
12352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12353 DAG.getConstant(DWordOffset, SL, MVT::i32));
12354 }
12355 if (ScalarTySize > 32) {
12356 Ret = DAG.getNode(
12357 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12358 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12359 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12360 if (ShiftVal)
12361 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12362 DAG.getConstant(ShiftVal, SL, MVT::i32));
12363 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12364 }
12365
12366 assert(ScalarTySize < 32);
12367 auto NumElements = TypeSize / ScalarTySize;
12368 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12369 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12370 auto NumElementsIn32 = 32 / ScalarTySize;
12371 auto NumAvailElements = DWordOffset < Trunc32Elements
12372 ? NumElementsIn32
12373 : NumElements - NormalizedTrunc;
12374
12376 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12377 NumAvailElements);
12378
12379 Ret = DAG.getBuildVector(
12380 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12381 VecSrcs);
12382 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12383 }
12384
12385 /// Scalar Type
12386 auto ShiftVal = 32 * DWordOffset;
12387 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12388 DAG.getConstant(ShiftVal, SL, MVT::i32));
12389 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12390}
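// Illustrative example (editorial sketch, not part of the original source):
// for a v4i16 source and DWordOffset 1, the sub-32-bit vector path extracts
// elements 2 and 3, rebuilds them as a v2i16, and bitcasts the result to i32.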
12391
12392static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12393 SelectionDAG &DAG = DCI.DAG;
12394 [[maybe_unused]] EVT VT = N->getValueType(0);
12395 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12396
12397 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12398 assert(VT == MVT::i32);
12399 for (int i = 0; i < 4; i++) {
12400 // Find the ByteProvider that provides the ith byte of the result of OR
12401 std::optional<ByteProvider<SDValue>> P =
12402 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12403 // TODO support constantZero
12404 if (!P || P->isConstantZero())
12405 return SDValue();
12406
12407 PermNodes.push_back(*P);
12408 }
12409 if (PermNodes.size() != 4)
12410 return SDValue();
12411
12412 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12413 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12414 uint64_t PermMask = 0x00000000;
12415 for (size_t i = 0; i < PermNodes.size(); i++) {
12416 auto PermOp = PermNodes[i];
12417 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12418 // by sizeof(Src2) = 4
12419 int SrcByteAdjust = 4;
12420
12421 // If the Src uses a byte from a different DWORD, then it corresponds
12422 // with a different source.
12423 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12424 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12425 if (SecondSrc)
12426 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12427 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12428 return SDValue();
12429
12430 // Set the index of the second distinct Src node
12431 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12432 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12433 SrcByteAdjust = 0;
12434 }
12435 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12437 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12438 }
12439 SDLoc DL(N);
12440 SDValue Op = *PermNodes[FirstSrc.first].Src;
12441 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12442 assert(Op.getValueSizeInBits() == 32);
12443
12444 // Check that we are not just extracting the bytes in order from an op
12445 if (!SecondSrc) {
12446 int Low16 = PermMask & 0xffff;
12447 int Hi16 = (PermMask & 0xffff0000) >> 16;
12448
12449 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12450 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12451
12452 // The perm op would really just produce Op. So combine into Op
12453 if (WellFormedLow && WellFormedHi)
12454 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12455 }
12456
12457 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12458
12459 if (SecondSrc) {
12460 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12461 assert(OtherOp.getValueSizeInBits() == 32);
12462 }
12463
12464 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12465
12466 assert(Op.getValueType().isByteSized() &&
12467 OtherOp.getValueType().isByteSized());
12468
12469 // If the ultimate src is less than 32 bits, then we will only be
12470 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12471 // CalculateByteProvider would not have returned Op as source if we
12472 // used a byte that is outside its ValueType. Thus, we are free to
12473 // ANY_EXTEND as the extended bits are dont-cares.
12474 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12475 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12476
12477 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12478 DAG.getConstant(PermMask, DL, MVT::i32));
12479 }
12480 return SDValue();
12481}
12482
12483SDValue SITargetLowering::performOrCombine(SDNode *N,
12484 DAGCombinerInfo &DCI) const {
12485 SelectionDAG &DAG = DCI.DAG;
12486 SDValue LHS = N->getOperand(0);
12487 SDValue RHS = N->getOperand(1);
12488
12489 EVT VT = N->getValueType(0);
12490 if (VT == MVT::i1) {
12491 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12492 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12493 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12494 SDValue Src = LHS.getOperand(0);
12495 if (Src != RHS.getOperand(0))
12496 return SDValue();
12497
12498 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12499 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12500 if (!CLHS || !CRHS)
12501 return SDValue();
12502
12503 // Only 10 bits are used.
12504 static const uint32_t MaxMask = 0x3ff;
12505
12506 uint32_t NewMask =
12507 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12508 SDLoc DL(N);
12509 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12510 DAG.getConstant(NewMask, DL, MVT::i32));
12511 }
12512
12513 return SDValue();
12514 }
12515
12516 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12517 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12518 LHS.getOpcode() == AMDGPUISD::PERM &&
12519 isa<ConstantSDNode>(LHS.getOperand(2))) {
12520 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12521 if (!Sel)
12522 return SDValue();
12523
12524 Sel |= LHS.getConstantOperandVal(2);
12525 SDLoc DL(N);
12526 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12527 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12528 }
12529
12530 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12532 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12533 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12534
12535 // If all the uses of an or need to extract the individual elements, do not
12536 // attempt to lower into v_perm
12537 auto usesCombinedOperand = [](SDNode *OrUse) {
12538 // If we have any non-vectorized use, then it is a candidate for v_perm
12539 if (OrUse->getOpcode() != ISD::BITCAST ||
12540 !OrUse->getValueType(0).isVector())
12541 return true;
12542
12543 // If we have any non-vectorized use, then it is a candidate for v_perm
12544 for (auto *VUser : OrUse->users()) {
12545 if (!VUser->getValueType(0).isVector())
12546 return true;
12547
12548 // If the use of a vector is a store, then combining via a v_perm
12549 // is beneficial.
12550 // TODO -- whitelist more uses
12551 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12552 if (VUser->getOpcode() == VectorwiseOp)
12553 return true;
12554 }
12555 return false;
12556 };
12557
12558 if (!any_of(N->users(), usesCombinedOperand))
12559 return SDValue();
12560
12561 uint32_t LHSMask = getPermuteMask(LHS);
12562 uint32_t RHSMask = getPermuteMask(RHS);
12563
12564 if (LHSMask != ~0u && RHSMask != ~0u) {
12565 // Canonicalize the expression in an attempt to have fewer unique masks
12566 // and therefore fewer registers used to hold the masks.
12567 if (LHSMask > RHSMask) {
12568 std::swap(LHSMask, RHSMask);
12569 std::swap(LHS, RHS);
12570 }
12571
12572 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12573 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12574 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12575 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12576
12577 // Check if we need to combine values from two sources within a byte.
12578 if (!(LHSUsedLanes & RHSUsedLanes) &&
12579 // If we select the high word and the low word, keep it for SDWA.
12580 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12581 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12582 // Kill zero bytes selected by other mask. Zero value is 0xc.
12583 LHSMask &= ~RHSUsedLanes;
12584 RHSMask &= ~LHSUsedLanes;
12585 // Add 4 to each active LHS lane
12586 LHSMask |= LHSUsedLanes & 0x04040404;
12587 // Combine masks
12588 uint32_t Sel = LHSMask | RHSMask;
12589 SDLoc DL(N);
12590
12591 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12592 RHS.getOperand(0),
12593 DAG.getConstant(Sel, DL, MVT::i32));
12594 }
12595 }
12596 if (LHSMask == ~0u || RHSMask == ~0u) {
12597 if (SDValue Perm = matchPERM(N, DCI))
12598 return Perm;
12599 }
12600 }
12601
12602 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12603 return SDValue();
12604
12605 // TODO: This could be a generic combine with a predicate for extracting the
12606 // high half of an integer being free.
12607
12608 // (or i64:x, (zero_extend i32:y)) ->
12609 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12610 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12611 RHS.getOpcode() != ISD::ZERO_EXTEND)
12612 std::swap(LHS, RHS);
12613
12614 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12615 SDValue ExtSrc = RHS.getOperand(0);
12616 EVT SrcVT = ExtSrc.getValueType();
12617 if (SrcVT == MVT::i32) {
12618 SDLoc SL(N);
12619 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12620 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12621
12622 DCI.AddToWorklist(LowOr.getNode());
12623 DCI.AddToWorklist(HiBits.getNode());
12624
12625 SDValue Vec =
12626 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12627 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12628 }
12629 }
12630
12631 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12632 if (CRHS) {
12633 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12634 N->getOperand(0), CRHS))
12635 return Split;
12636 }
12637
12638 return SDValue();
12639}
12640
12641SDValue SITargetLowering::performXorCombine(SDNode *N,
12642 DAGCombinerInfo &DCI) const {
12643 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12644 return RV;
12645
12646 SDValue LHS = N->getOperand(0);
12647 SDValue RHS = N->getOperand(1);
12648
12649 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12650 SelectionDAG &DAG = DCI.DAG;
12651
12652 EVT VT = N->getValueType(0);
12653 if (CRHS && VT == MVT::i64) {
12654 if (SDValue Split =
12655 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12656 return Split;
12657 }
12658
12659 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12660 // fneg-like xors into 64-bit select.
12661 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12662 // This looks like an fneg, try to fold as a source modifier.
12663 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12664 shouldFoldFNegIntoSrc(N, LHS)) {
12665 // xor (select c, a, b), 0x80000000 ->
12666 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12667 SDLoc DL(N);
12668 SDValue CastLHS =
12669 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12670 SDValue CastRHS =
12671 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12672 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12673 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12674 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12675 LHS->getOperand(0), FNegLHS, FNegRHS);
12676 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12677 }
12678 }
12679
12680 return SDValue();
12681}
12682
12683SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12684 DAGCombinerInfo &DCI) const {
12685 if (!Subtarget->has16BitInsts() ||
12686 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12687 return SDValue();
12688
12689 EVT VT = N->getValueType(0);
12690 if (VT != MVT::i32)
12691 return SDValue();
12692
12693 SDValue Src = N->getOperand(0);
12694 if (Src.getValueType() != MVT::i16)
12695 return SDValue();
12696
12697 return SDValue();
12698}
12699
12700SDValue
12701SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12702 DAGCombinerInfo &DCI) const {
12703 SDValue Src = N->getOperand(0);
12704 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12705
12706 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12707 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12708 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12709 VTSign->getVT() == MVT::i8) ||
12710 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12711 VTSign->getVT() == MVT::i16))) {
12712 assert(Subtarget->hasScalarSubwordLoads() &&
12713 "s_buffer_load_{u8, i8} are supported "
12714 "in GFX12 (or newer) architectures.");
12715 EVT VT = Src.getValueType();
12716 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12717 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12718 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12719 SDLoc DL(N);
12720 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12721 SDValue Ops[] = {
12722 Src.getOperand(0), // source register
12723 Src.getOperand(1), // offset
12724 Src.getOperand(2) // cachePolicy
12725 };
12726 auto *M = cast<MemSDNode>(Src);
12727 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12728 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12729 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12730 return LoadVal;
12731 }
12732 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12733 VTSign->getVT() == MVT::i8) ||
12734 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12735 VTSign->getVT() == MVT::i16)) &&
12736 Src.hasOneUse()) {
12737 auto *M = cast<MemSDNode>(Src);
12738 SDValue Ops[] = {Src.getOperand(0), // Chain
12739 Src.getOperand(1), // rsrc
12740 Src.getOperand(2), // vindex
12741 Src.getOperand(3), // voffset
12742 Src.getOperand(4), // soffset
12743 Src.getOperand(5), // offset
12744 Src.getOperand(6), Src.getOperand(7)};
12745 // replace with BUFFER_LOAD_BYTE/SHORT
12746 SDVTList ResList =
12747 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12748 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12749 ? AMDGPUISD::BUFFER_LOAD_BYTE
12750 : AMDGPUISD::BUFFER_LOAD_SHORT;
12751 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12752 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12753 return DCI.DAG.getMergeValues(
12754 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12755 }
12756 return SDValue();
12757}
12758
12759SDValue SITargetLowering::performClassCombine(SDNode *N,
12760 DAGCombinerInfo &DCI) const {
12761 SelectionDAG &DAG = DCI.DAG;
12762 SDValue Mask = N->getOperand(1);
12763
12764 // fp_class x, 0 -> false
12765 if (isNullConstant(Mask))
12766 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12767
12768 if (N->getOperand(0).isUndef())
12769 return DAG.getUNDEF(MVT::i1);
12770
12771 return SDValue();
12772}
12773
12774SDValue SITargetLowering::performRcpCombine(SDNode *N,
12775 DAGCombinerInfo &DCI) const {
12776 EVT VT = N->getValueType(0);
12777 SDValue N0 = N->getOperand(0);
12778
12779 if (N0.isUndef()) {
12780 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12781 SDLoc(N), VT);
12782 }
12783
12784 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12785 N0.getOpcode() == ISD::SINT_TO_FP)) {
12786 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12787 N->getFlags());
12788 }
12789
12790 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12791 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12792 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12793 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12794 N->getFlags());
12795 }
12796
12797 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12798}
12799
12800bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12801 unsigned MaxDepth) const {
12802 unsigned Opcode = Op.getOpcode();
12803 if (Opcode == ISD::FCANONICALIZE)
12804 return true;
12805
12806 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12807 const auto &F = CFP->getValueAPF();
12808 if (F.isNaN() && F.isSignaling())
12809 return false;
12810 if (!F.isDenormal())
12811 return true;
12812
12813 DenormalMode Mode =
12814 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12815 return Mode == DenormalMode::getIEEE();
12816 }
12817
12818 // If source is a result of another standard FP operation it is already in
12819 // canonical form.
12820 if (MaxDepth == 0)
12821 return false;
12822
12823 switch (Opcode) {
12824 // These will flush denorms if required.
12825 case ISD::FADD:
12826 case ISD::FSUB:
12827 case ISD::FMUL:
12828 case ISD::FCEIL:
12829 case ISD::FFLOOR:
12830 case ISD::FMA:
12831 case ISD::FMAD:
12832 case ISD::FSQRT:
12833 case ISD::FDIV:
12834 case ISD::FREM:
12835 case ISD::FP_ROUND:
12836 case ISD::FP_EXTEND:
12837 case ISD::FP16_TO_FP:
12838 case ISD::FP_TO_FP16:
12839 case ISD::BF16_TO_FP:
12840 case ISD::FP_TO_BF16:
12841 case ISD::FLDEXP:
12844 case AMDGPUISD::RCP:
12845 case AMDGPUISD::RSQ:
12849 case AMDGPUISD::LOG:
12850 case AMDGPUISD::EXP:
12854 case AMDGPUISD::FRACT:
12861 case AMDGPUISD::SIN_HW:
12862 case AMDGPUISD::COS_HW:
12863 return true;
12864
12865 // It can/will be lowered or combined as a bit operation.
12866 // Need to check their input recursively to handle.
12867 case ISD::FNEG:
12868 case ISD::FABS:
12869 case ISD::FCOPYSIGN:
12870 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12871
12872 case ISD::AND:
12873 if (Op.getValueType() == MVT::i32) {
12874 // Be careful as we only know it is a bitcast floating point type. It
12875 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12876 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12877 // is valid to optimize for all types.
12878 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12879 if (RHS->getZExtValue() == 0xffff0000) {
12880 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12881 }
12882 }
12883 }
12884 break;
12885
12886 case ISD::FSIN:
12887 case ISD::FCOS:
12888 case ISD::FSINCOS:
12889 return Op.getValueType().getScalarType() != MVT::f16;
12890
12891 case ISD::FMINNUM:
12892 case ISD::FMAXNUM:
12893 case ISD::FMINNUM_IEEE:
12894 case ISD::FMAXNUM_IEEE:
12895 case ISD::FMINIMUM:
12896 case ISD::FMAXIMUM:
12897 case AMDGPUISD::CLAMP:
12898 case AMDGPUISD::FMED3:
12899 case AMDGPUISD::FMAX3:
12900 case AMDGPUISD::FMIN3:
12901 case AMDGPUISD::FMAXIMUM3:
12902 case AMDGPUISD::FMINIMUM3: {
12903 // FIXME: Shouldn't treat the generic operations differently based on these.
12904 // However, we aren't really required to flush the result from
12905 // minnum/maxnum.
12906
12907 // snans will be quieted, so we only need to worry about denormals.
12908 if (Subtarget->supportsMinMaxDenormModes() ||
12909 // FIXME: denormalsEnabledForType is broken for dynamic
12910 denormalsEnabledForType(DAG, Op.getValueType()))
12911 return true;
12912
12913 // Flushing may be required.
12914 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12915 // targets we need to check the inputs recursively.
12916
12917 // FIXME: Does this apply with clamp? It's implemented with max.
12918 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12919 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12920 return false;
12921 }
12922
12923 return true;
12924 }
12925 case ISD::SELECT: {
12926 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12927 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12928 }
12929 case ISD::BUILD_VECTOR: {
12930 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12931 SDValue SrcOp = Op.getOperand(i);
12932 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12933 return false;
12934 }
12935
12936 return true;
12937 }
12938 case ISD::EXTRACT_VECTOR_ELT:
12939 case ISD::EXTRACT_SUBVECTOR: {
12940 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12941 }
12942 case ISD::INSERT_VECTOR_ELT: {
12943 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12944 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12945 }
12946 case ISD::UNDEF:
12947 // Could be anything.
12948 return false;
12949
12950 case ISD::BITCAST:
12951 // TODO: This is incorrect as it loses track of the operand's type. We may
12952 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12953 // same bits that are canonicalized in one type need not be in the other.
12954 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12955 case ISD::TRUNCATE: {
12956 // Hack around the mess we make when legalizing extract_vector_elt
12957 if (Op.getValueType() == MVT::i16) {
12958 SDValue TruncSrc = Op.getOperand(0);
12959 if (TruncSrc.getValueType() == MVT::i32 &&
12960 TruncSrc.getOpcode() == ISD::BITCAST &&
12961 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12962 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12963 }
12964 }
12965 return false;
12966 }
12967 case ISD::INTRINSIC_WO_CHAIN: {
12968 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12969 // TODO: Handle more intrinsics
12970 switch (IntrinsicID) {
12971 case Intrinsic::amdgcn_cvt_pkrtz:
12972 case Intrinsic::amdgcn_cubeid:
12973 case Intrinsic::amdgcn_frexp_mant:
12974 case Intrinsic::amdgcn_fdot2:
12975 case Intrinsic::amdgcn_rcp:
12976 case Intrinsic::amdgcn_rsq:
12977 case Intrinsic::amdgcn_rsq_clamp:
12978 case Intrinsic::amdgcn_rcp_legacy:
12979 case Intrinsic::amdgcn_rsq_legacy:
12980 case Intrinsic::amdgcn_trig_preop:
12981 case Intrinsic::amdgcn_log:
12982 case Intrinsic::amdgcn_exp2:
12983 case Intrinsic::amdgcn_sqrt:
12984 return true;
12985 default:
12986 break;
12987 }
12988
12989 break;
12990 }
12991 default:
12992 break;
12993 }
12994
12995 // FIXME: denormalsEnabledForType is broken for dynamic
12996 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12997 DAG.isKnownNeverSNaN(Op);
12998}
12999
13001 unsigned MaxDepth) const {
13002 const MachineRegisterInfo &MRI = MF.getRegInfo();
13003 MachineInstr *MI = MRI.getVRegDef(Reg);
13004 unsigned Opcode = MI->getOpcode();
13005
13006 if (Opcode == AMDGPU::G_FCANONICALIZE)
13007 return true;
13008
13009 std::optional<FPValueAndVReg> FCR;
13010 // Constant splat (can be padded with undef) or scalar constant.
13011 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13012 if (FCR->Value.isSignaling())
13013 return false;
13014 if (!FCR->Value.isDenormal())
13015 return true;
13016
13017 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13018 return Mode == DenormalMode::getIEEE();
13019 }
13020
13021 if (MaxDepth == 0)
13022 return false;
13023
13024 switch (Opcode) {
13025 case AMDGPU::G_FADD:
13026 case AMDGPU::G_FSUB:
13027 case AMDGPU::G_FMUL:
13028 case AMDGPU::G_FCEIL:
13029 case AMDGPU::G_FFLOOR:
13030 case AMDGPU::G_FRINT:
13031 case AMDGPU::G_FNEARBYINT:
13032 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13033 case AMDGPU::G_INTRINSIC_TRUNC:
13034 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13035 case AMDGPU::G_FMA:
13036 case AMDGPU::G_FMAD:
13037 case AMDGPU::G_FSQRT:
13038 case AMDGPU::G_FDIV:
13039 case AMDGPU::G_FREM:
13040 case AMDGPU::G_FPOW:
13041 case AMDGPU::G_FPEXT:
13042 case AMDGPU::G_FLOG:
13043 case AMDGPU::G_FLOG2:
13044 case AMDGPU::G_FLOG10:
13045 case AMDGPU::G_FPTRUNC:
13046 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13047 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13048 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13049 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13050 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13051 return true;
13052 case AMDGPU::G_FNEG:
13053 case AMDGPU::G_FABS:
13054 case AMDGPU::G_FCOPYSIGN:
13055 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13056 case AMDGPU::G_FMINNUM:
13057 case AMDGPU::G_FMAXNUM:
13058 case AMDGPU::G_FMINNUM_IEEE:
13059 case AMDGPU::G_FMAXNUM_IEEE:
13060 case AMDGPU::G_FMINIMUM:
13061 case AMDGPU::G_FMAXIMUM: {
13062 if (Subtarget->supportsMinMaxDenormModes() ||
13063 // FIXME: denormalsEnabledForType is broken for dynamic
13064 denormalsEnabledForType(MRI.getType(Reg), MF))
13065 return true;
13066
13067 [[fallthrough]];
13068 }
13069 case AMDGPU::G_BUILD_VECTOR:
13070 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13071 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13072 return false;
13073 return true;
13074 case AMDGPU::G_INTRINSIC:
13075 case AMDGPU::G_INTRINSIC_CONVERGENT:
13076 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13077 case Intrinsic::amdgcn_fmul_legacy:
13078 case Intrinsic::amdgcn_fmad_ftz:
13079 case Intrinsic::amdgcn_sqrt:
13080 case Intrinsic::amdgcn_fmed3:
13081 case Intrinsic::amdgcn_sin:
13082 case Intrinsic::amdgcn_cos:
13083 case Intrinsic::amdgcn_log:
13084 case Intrinsic::amdgcn_exp2:
13085 case Intrinsic::amdgcn_log_clamp:
13086 case Intrinsic::amdgcn_rcp:
13087 case Intrinsic::amdgcn_rcp_legacy:
13088 case Intrinsic::amdgcn_rsq:
13089 case Intrinsic::amdgcn_rsq_clamp:
13090 case Intrinsic::amdgcn_rsq_legacy:
13091 case Intrinsic::amdgcn_div_scale:
13092 case Intrinsic::amdgcn_div_fmas:
13093 case Intrinsic::amdgcn_div_fixup:
13094 case Intrinsic::amdgcn_fract:
13095 case Intrinsic::amdgcn_cvt_pkrtz:
13096 case Intrinsic::amdgcn_cubeid:
13097 case Intrinsic::amdgcn_cubema:
13098 case Intrinsic::amdgcn_cubesc:
13099 case Intrinsic::amdgcn_cubetc:
13100 case Intrinsic::amdgcn_frexp_mant:
13101 case Intrinsic::amdgcn_fdot2:
13102 case Intrinsic::amdgcn_trig_preop:
13103 return true;
13104 default:
13105 break;
13106 }
13107
13108 [[fallthrough]];
13109 default:
13110 return false;
13111 }
13112
13113 llvm_unreachable("invalid operation");
13114}
13115
13116// Constant fold canonicalize.
13117SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13118 const SDLoc &SL, EVT VT,
13119 const APFloat &C) const {
13120 // Flush denormals to 0 if not enabled.
13121 if (C.isDenormal()) {
13122 DenormalMode Mode =
13123 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13124 if (Mode == DenormalMode::getPreserveSign()) {
13125 return DAG.getConstantFP(
13126 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13127 }
13128
13129 if (Mode != DenormalMode::getIEEE())
13130 return SDValue();
13131 }
13132
13133 if (C.isNaN()) {
13134 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13135 if (C.isSignaling()) {
13136 // Quiet a signaling NaN.
13137 // FIXME: Is this supposed to preserve payload bits?
13138 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13139 }
13140
13141 // Make sure it is the canonical NaN bitpattern.
13142 //
13143 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13144 // immediate?
13145 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13146 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13147 }
13148
13149 // Already canonical.
13150 return DAG.getConstantFP(C, SL, VT);
13151}
13152
13153static bool vectorEltWillFoldAway(SDValue Op) {
13154 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13155}
13156
13157SDValue
13158SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13159 DAGCombinerInfo &DCI) const {
13160 SelectionDAG &DAG = DCI.DAG;
13161 SDValue N0 = N->getOperand(0);
13162 EVT VT = N->getValueType(0);
13163
13164 // fcanonicalize undef -> qnan
13165 if (N0.isUndef()) {
13166 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13167 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13168 }
13169
13170 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13171 EVT VT = N->getValueType(0);
13172 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13173 }
13174
13175 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13176 // (fcanonicalize k)
13177 //
13178 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13179
13180 // TODO: This could be better with wider vectors that will be split to v2f16,
13181 // and to consider uses since there aren't that many packed operations.
13182 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13183 isTypeLegal(MVT::v2f16)) {
13184 SDLoc SL(N);
13185 SDValue NewElts[2];
13186 SDValue Lo = N0.getOperand(0);
13187 SDValue Hi = N0.getOperand(1);
13188 EVT EltVT = Lo.getValueType();
13189
13191 for (unsigned I = 0; I != 2; ++I) {
13192 SDValue Op = N0.getOperand(I);
13193 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13194 NewElts[I] =
13195 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13196 } else if (Op.isUndef()) {
13197 // Handled below based on what the other operand is.
13198 NewElts[I] = Op;
13199 } else {
13200 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13201 }
13202 }
13203
13204 // If one half is undef, and one is constant, prefer a splat vector rather
13205 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13206 // cheaper to use and may be free with a packed operation.
13207 if (NewElts[0].isUndef()) {
13208 if (isa<ConstantFPSDNode>(NewElts[1]))
13209 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13210 ? NewElts[1]
13211 : DAG.getConstantFP(0.0f, SL, EltVT);
13212 }
13213
13214 if (NewElts[1].isUndef()) {
13215 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13216 ? NewElts[0]
13217 : DAG.getConstantFP(0.0f, SL, EltVT);
13218 }
13219
13220 return DAG.getBuildVector(VT, SL, NewElts);
13221 }
13222 }
13223
13224 return SDValue();
13225}
13226
13227static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13228 switch (Opc) {
13229 case ISD::FMAXNUM:
13230 case ISD::FMAXNUM_IEEE:
13231 return AMDGPUISD::FMAX3;
13232 case ISD::FMAXIMUM:
13233 return AMDGPUISD::FMAXIMUM3;
13234 case ISD::SMAX:
13235 return AMDGPUISD::SMAX3;
13236 case ISD::UMAX:
13237 return AMDGPUISD::UMAX3;
13238 case ISD::FMINNUM:
13239 case ISD::FMINNUM_IEEE:
13240 return AMDGPUISD::FMIN3;
13241 case ISD::FMINIMUM:
13242 return AMDGPUISD::FMINIMUM3;
13243 case ISD::SMIN:
13244 return AMDGPUISD::SMIN3;
13245 case ISD::UMIN:
13246 return AMDGPUISD::UMIN3;
13247 default:
13248 llvm_unreachable("Not a min/max opcode");
13249 }
13250}
13251
13252SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13253 const SDLoc &SL, SDValue Src,
13254 SDValue MinVal,
13255 SDValue MaxVal,
13256 bool Signed) const {
13257
13258 // med3 comes from
13259 // min(max(x, K0), K1), K0 < K1
13260 // max(min(x, K0), K1), K1 < K0
13261 //
13262 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13263 // min/max op.
13264 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13265 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13266
13267 if (!MinK || !MaxK)
13268 return SDValue();
13269
13270 if (Signed) {
13271 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13272 return SDValue();
13273 } else {
13274 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13275 return SDValue();
13276 }
13277
13278 EVT VT = MinK->getValueType(0);
13279 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13280 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13281 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13282
13283 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13284 // not available, but this is unlikely to be profitable as constants
13285 // will often need to be materialized & extended, especially on
13286 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13287 return SDValue();
13288}
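// Illustrative example (editorial sketch, not part of the original source):
// smin(smax(x, 2), 7) satisfies K0 = 2 < K1 = 7, so for an i32 input it
// becomes (smed3 x, 2, 7), i.e. v_med3_i32.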
13289
13290static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13291 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13292 return C;
13293
13294 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13295 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13296 return C;
13297 }
13298
13299 return nullptr;
13300}
13301
13302SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13303 const SDLoc &SL, SDValue Op0,
13304 SDValue Op1) const {
13305 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13306 if (!K1)
13307 return SDValue();
13308
13309 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13310 if (!K0)
13311 return SDValue();
13312
13313 // Ordered >= (although NaN inputs should have folded away by now).
13314 if (K0->getValueAPF() > K1->getValueAPF())
13315 return SDValue();
13316
13317 const MachineFunction &MF = DAG.getMachineFunction();
13318 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13319
13320 // TODO: Check IEEE bit enabled?
13321 EVT VT = Op0.getValueType();
13322 if (Info->getMode().DX10Clamp) {
13323 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13324 // hardware fmed3 behavior converting to a min.
13325 // FIXME: Should this be allowing -0.0?
13326 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13327 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13328 }
13329
13330 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13331 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13332 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13333 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13334 // then give the other result, which is different from med3 with a NaN
13335 // input.
13336 SDValue Var = Op0.getOperand(0);
13337 if (!DAG.isKnownNeverSNaN(Var))
13338 return SDValue();
13339
13340 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13341
13342 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13343 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13344 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13345 SDValue(K0, 0), SDValue(K1, 0));
13346 }
13347 }
13348
13349 return SDValue();
13350}
13351
13352/// \return true if the subtarget supports minimum3 and maximum3 with the given
13353/// base min/max opcode \p Opc for type \p VT.
13354static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13355 EVT VT) {
13356 switch (Opc) {
13357 case ISD::FMINNUM:
13358 case ISD::FMAXNUM:
13359 case ISD::FMINNUM_IEEE:
13360 case ISD::FMAXNUM_IEEE:
13361 case AMDGPUISD::FMIN_LEGACY:
13362 case AMDGPUISD::FMAX_LEGACY:
13363 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13364 case ISD::FMINIMUM:
13365 case ISD::FMAXIMUM:
13366 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13367 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13368 case ISD::SMAX:
13369 case ISD::SMIN:
13370 case ISD::UMAX:
13371 case ISD::UMIN:
13372 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13373 default:
13374 return false;
13375 }
13376
13377 llvm_unreachable("not a min/max opcode");
13378}
13379
13380SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13381 DAGCombinerInfo &DCI) const {
13382 SelectionDAG &DAG = DCI.DAG;
13383
13384 EVT VT = N->getValueType(0);
13385 unsigned Opc = N->getOpcode();
13386 SDValue Op0 = N->getOperand(0);
13387 SDValue Op1 = N->getOperand(1);
13388
13389 // Only do this if the inner op has one use since this will just increase
13390 // register pressure for no benefit.
13391
13392 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13393 // max(max(a, b), c) -> max3(a, b, c)
13394 // min(min(a, b), c) -> min3(a, b, c)
13395 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13396 SDLoc DL(N);
13397 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13398 Op0.getOperand(0), Op0.getOperand(1), Op1);
13399 }
13400
13401 // Try commuted.
13402 // max(a, max(b, c)) -> max3(a, b, c)
13403 // min(a, min(b, c)) -> min3(a, b, c)
13404 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13405 SDLoc DL(N);
13406 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13407 Op0, Op1.getOperand(0), Op1.getOperand(1));
13408 }
13409 }
13410
13411 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13412 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13413 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13414 if (SDValue Med3 = performIntMed3ImmCombine(
13415 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13416 return Med3;
13417 }
13418 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13419 if (SDValue Med3 = performIntMed3ImmCombine(
13420 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13421 return Med3;
13422 }
13423
13424 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13425 if (SDValue Med3 = performIntMed3ImmCombine(
13426 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13427 return Med3;
13428 }
13429 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13430 if (SDValue Med3 = performIntMed3ImmCombine(
13431 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13432 return Med3;
13433 }
13434
13435 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13436 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13437 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13438 (Opc == AMDGPUISD::FMIN_LEGACY &&
13439 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13440 (VT == MVT::f32 || VT == MVT::f64 ||
13441 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13442 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13443 Op0.hasOneUse()) {
13444 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13445 return Res;
13446 }
13447
13448 return SDValue();
13449}
13450
13451 static bool isClampZeroToOne(SDValue A, SDValue B) {
13452 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13453 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13454 // FIXME: Should this be allowing -0.0?
13455 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13456 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13457 }
13458 }
13459
13460 return false;
13461}
13462
13463// FIXME: Should only worry about snans for version with chain.
13464SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13465 DAGCombinerInfo &DCI) const {
13466 EVT VT = N->getValueType(0);
13467 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13468 // NaNs. With a NaN input, the order of the operands may change the result.
13469
13470 SelectionDAG &DAG = DCI.DAG;
13471 SDLoc SL(N);
13472
13473 SDValue Src0 = N->getOperand(0);
13474 SDValue Src1 = N->getOperand(1);
13475 SDValue Src2 = N->getOperand(2);
13476
13477 if (isClampZeroToOne(Src0, Src1)) {
13478 // const_a, const_b, x -> clamp is safe in all cases including signaling
13479 // nans.
13480 // FIXME: Should this be allowing -0.0?
13481 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13482 }
13483
13484 const MachineFunction &MF = DAG.getMachineFunction();
13485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13486
13487 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13488 // handling no dx10-clamp?
13489 if (Info->getMode().DX10Clamp) {
13490 // If NaNs are clamped to 0, we are free to reorder the inputs.
13491
13492 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13493 std::swap(Src0, Src1);
13494
13495 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13496 std::swap(Src1, Src2);
13497
13498 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13499 std::swap(Src0, Src1);
13500
13501 if (isClampZeroToOne(Src1, Src2))
13502 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13503 }
13504
13505 return SDValue();
13506}
13507
13508SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13509 DAGCombinerInfo &DCI) const {
13510 SDValue Src0 = N->getOperand(0);
13511 SDValue Src1 = N->getOperand(1);
13512 if (Src0.isUndef() && Src1.isUndef())
13513 return DCI.DAG.getUNDEF(N->getValueType(0));
13514 return SDValue();
13515}
13516
13517// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13518// expanded into a set of cmp/select instructions.
13519 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13520 unsigned NumElem,
13521 bool IsDivergentIdx,
13522 const GCNSubtarget *Subtarget) {
13523 if (UseDivergentRegisterIndexing)
13524 return false;
13525
13526 unsigned VecSize = EltSize * NumElem;
13527
13528 // Sub-dword vectors of size 2 dwords or less have a better implementation.
13529 if (VecSize <= 64 && EltSize < 32)
13530 return false;
13531
13532 // Always expand the rest of the sub-dword instructions, otherwise they will
13533 // be lowered via memory.
13534 if (EltSize < 32)
13535 return true;
13536
13537 // Always do this if var-idx is divergent, otherwise it will become a loop.
13538 if (IsDivergentIdx)
13539 return true;
13540
13541 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13542 unsigned NumInsts = NumElem /* Number of compares */ +
13543 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13544
13545 // On some architectures (GFX9) movrel is not available and it's better
13546 // to expand.
13547 if (Subtarget->useVGPRIndexMode())
13548 return NumInsts <= 16;
13549
13550 // If movrel is available, use it instead of expanding for vector of 8
13551 // elements.
13552 if (Subtarget->hasMovrel())
13553 return NumInsts <= 15;
13554
13555 return true;
13556}
13557
13558 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13559 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13560 if (isa<ConstantSDNode>(Idx))
13561 return false;
13562
13563 SDValue Vec = N->getOperand(0);
13564 EVT VecVT = Vec.getValueType();
13565 EVT EltVT = VecVT.getVectorElementType();
13566 unsigned EltSize = EltVT.getSizeInBits();
13567 unsigned NumElem = VecVT.getVectorNumElements();
13568
13569 return SITargetLowering::shouldExpandVectorDynExt(
13570 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13571}
13572
13573SDValue
13574SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13575 DAGCombinerInfo &DCI) const {
13576 SDValue Vec = N->getOperand(0);
13577 SelectionDAG &DAG = DCI.DAG;
13578
13579 EVT VecVT = Vec.getValueType();
13580 EVT VecEltVT = VecVT.getVectorElementType();
13581 EVT ResVT = N->getValueType(0);
13582
13583 unsigned VecSize = VecVT.getSizeInBits();
13584 unsigned VecEltSize = VecEltVT.getSizeInBits();
13585
13586 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13587 allUsesHaveSourceMods(N)) {
13588 SDLoc SL(N);
13589 SDValue Idx = N->getOperand(1);
13590 SDValue Elt =
13591 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13592 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13593 }
13594
13595 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13596 // =>
13597 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13598 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13599 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13600 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13601 SDLoc SL(N);
13602 SDValue Idx = N->getOperand(1);
13603 unsigned Opc = Vec.getOpcode();
13604
13605 switch (Opc) {
13606 default:
13607 break;
13608 // TODO: Support other binary operations.
13609 case ISD::FADD:
13610 case ISD::FSUB:
13611 case ISD::FMUL:
13612 case ISD::ADD:
13613 case ISD::UMIN:
13614 case ISD::UMAX:
13615 case ISD::SMIN:
13616 case ISD::SMAX:
13617 case ISD::FMAXNUM:
13618 case ISD::FMINNUM:
13619 case ISD::FMAXNUM_IEEE:
13620 case ISD::FMINNUM_IEEE:
13621 case ISD::FMAXIMUM:
13622 case ISD::FMINIMUM: {
13623 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13624 Vec.getOperand(0), Idx);
13625 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13626 Vec.getOperand(1), Idx);
13627
13628 DCI.AddToWorklist(Elt0.getNode());
13629 DCI.AddToWorklist(Elt1.getNode());
13630 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13631 }
13632 }
13633 }
13634
13635 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13636 if (shouldExpandVectorDynExt(N)) {
13637 SDLoc SL(N);
13638 SDValue Idx = N->getOperand(1);
13639 SDValue V;
13640 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13641 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13642 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13643 if (I == 0)
13644 V = Elt;
13645 else
13646 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13647 }
13648 return V;
13649 }
13650
13651 if (!DCI.isBeforeLegalize())
13652 return SDValue();
13653
13654 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13655 // elements. This exposes more load reduction opportunities by replacing
13656 // multiple small extract_vector_elements with a single 32-bit extract.
13657 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13658 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13659 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13660 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13661
13662 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13663 unsigned EltIdx = BitIndex / 32;
13664 unsigned LeftoverBitIdx = BitIndex % 32;
13665 SDLoc SL(N);
13666
13667 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13668 DCI.AddToWorklist(Cast.getNode());
13669
13670 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13671 DAG.getConstant(EltIdx, SL, MVT::i32));
13672 DCI.AddToWorklist(Elt.getNode());
13673 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13674 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13675 DCI.AddToWorklist(Srl.getNode());
13676
13677 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13678 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13679 DCI.AddToWorklist(Trunc.getNode());
13680
13681 if (VecEltVT == ResVT) {
13682 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13683 }
13684
13685 assert(ResVT.isScalarInteger());
13686 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13687 }
13688
13689 return SDValue();
13690}
13691
13692SDValue
13693SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13694 DAGCombinerInfo &DCI) const {
13695 SDValue Vec = N->getOperand(0);
13696 SDValue Idx = N->getOperand(2);
13697 EVT VecVT = Vec.getValueType();
13698 EVT EltVT = VecVT.getVectorElementType();
13699
13700 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13701 // => BUILD_VECTOR n x select (e, const-idx)
13702 if (!shouldExpandVectorDynExt(N))
13703 return SDValue();
13704
13705 SelectionDAG &DAG = DCI.DAG;
13706 SDLoc SL(N);
13707 SDValue Ins = N->getOperand(1);
13708 EVT IdxVT = Idx.getValueType();
13709
13710 SmallVector<SDValue, 16> Ops;
13711 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13712 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13713 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13714 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13715 Ops.push_back(V);
13716 }
13717
13718 return DAG.getBuildVector(VecVT, SL, Ops);
13719}
13720
13721/// Return the source of an fp_extend from f16 to f32, or a converted FP
13722/// constant.
13723 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13724 if (Src.getOpcode() == ISD::FP_EXTEND &&
13725 Src.getOperand(0).getValueType() == MVT::f16) {
13726 return Src.getOperand(0);
13727 }
13728
13729 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13730 APFloat Val = CFP->getValueAPF();
13731 bool LosesInfo = true;
13732 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13733 if (!LosesInfo)
13734 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13735 }
13736
13737 return SDValue();
13738}
13739
13740SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13741 DAGCombinerInfo &DCI) const {
13742 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13743 "combine only useful on gfx8");
13744
13745 SDValue TruncSrc = N->getOperand(0);
13746 EVT VT = N->getValueType(0);
13747 if (VT != MVT::f16)
13748 return SDValue();
13749
13750 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13751 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13752 return SDValue();
13753
13754 SelectionDAG &DAG = DCI.DAG;
13755 SDLoc SL(N);
13756
13757 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13758 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13759 // casting back.
13760
13761 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13762 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13763 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13764 if (!A)
13765 return SDValue();
13766
13767 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13768 if (!B)
13769 return SDValue();
13770
13771 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13772 if (!C)
13773 return SDValue();
13774
13775 // This changes signaling nan behavior. If an input is a signaling nan, it
13776 // would have been quieted by the fpext originally. We don't care because
13777 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13778 // we would be worse off than just doing the promotion.
13779 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13780 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13781 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13782 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13783}
13784
13785unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13786 const SDNode *N0,
13787 const SDNode *N1) const {
13788 EVT VT = N0->getValueType(0);
13789
13790 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13791 // support denormals ever.
13792 if (((VT == MVT::f32 &&
13793 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13794 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13795 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13796 isOperationLegal(ISD::FMAD, VT))
13797 return ISD::FMAD;
13798
13799 const TargetOptions &Options = DAG.getTarget().Options;
13800 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13801 (N0->getFlags().hasAllowContract() &&
13802 N1->getFlags().hasAllowContract())) &&
13803 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13804 return ISD::FMA;
13805 }
13806
13807 return 0;
13808}
13809
13810// For a reassociatable opcode perform:
13811// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
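// Grouping the two uniform operands into the inner op lets that op be
// selected to a scalar (SALU) instruction, leaving only the outer op on the
// divergent (VALU) path.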
13812SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13813 SelectionDAG &DAG) const {
13814 EVT VT = N->getValueType(0);
13815 if (VT != MVT::i32 && VT != MVT::i64)
13816 return SDValue();
13817
13818 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13819 return SDValue();
13820
13821 unsigned Opc = N->getOpcode();
13822 SDValue Op0 = N->getOperand(0);
13823 SDValue Op1 = N->getOperand(1);
13824
13825 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13826 return SDValue();
13827
13828 if (Op0->isDivergent())
13829 std::swap(Op0, Op1);
13830
13831 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13832 return SDValue();
13833
13834 SDValue Op2 = Op1.getOperand(1);
13835 Op1 = Op1.getOperand(0);
13836 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13837 return SDValue();
13838
13839 if (Op1->isDivergent())
13840 std::swap(Op1, Op2);
13841
13842 SDLoc SL(N);
13843 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13844 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13845}
13846
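// Helper that emits a MAD_[IU]64_[IU]32 node: a 32x32-bit multiply whose
// 64-bit product is added to a 64-bit addend, producing a 64-bit result plus
// a carry bit. Only the 64-bit result is used here; it is truncated back to
// the caller's type.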
13847static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13848 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13849 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13850 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13851 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13852 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13853}
13854
13855// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13856// multiplies, if any.
13857//
13858// Full 64-bit multiplies that feed into an addition are lowered here instead
13859// of using the generic expansion. The generic expansion ends up with
13860// a tree of ADD nodes that prevents us from using the "add" part of the
13861// MAD instruction. The expansion produced here results in a chain of ADDs
13862// instead of a tree.
13863SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13864 DAGCombinerInfo &DCI) const {
13865 assert(N->getOpcode() == ISD::ADD);
13866
13867 SelectionDAG &DAG = DCI.DAG;
13868 EVT VT = N->getValueType(0);
13869 SDLoc SL(N);
13870 SDValue LHS = N->getOperand(0);
13871 SDValue RHS = N->getOperand(1);
13872
13873 if (VT.isVector())
13874 return SDValue();
13875
13876 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13877 // result in scalar registers for uniform values.
13878 if (!N->isDivergent() && Subtarget->hasSMulHi())
13879 return SDValue();
13880
13881 unsigned NumBits = VT.getScalarSizeInBits();
13882 if (NumBits <= 32 || NumBits > 64)
13883 return SDValue();
13884
13885 if (LHS.getOpcode() != ISD::MUL) {
13886 assert(RHS.getOpcode() == ISD::MUL);
13887 std::swap(LHS, RHS);
13888 }
13889
13890 // Avoid the fold if it would unduly increase the number of multiplies due to
13891 // multiple uses, except on hardware with full-rate multiply-add (which is
13892 // part of full-rate 64-bit ops).
13893 if (!Subtarget->hasFullRate64Ops()) {
13894 unsigned NumUsers = 0;
13895 for (SDNode *User : LHS->users()) {
13896 // There is a use that does not feed into addition, so the multiply can't
13897 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13898 if (User->getOpcode() != ISD::ADD)
13899 return SDValue();
13900
13901 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13902 // MUL + 3xADD + 3xADDC over 3xMAD.
13903 ++NumUsers;
13904 if (NumUsers >= 3)
13905 return SDValue();
13906 }
13907 }
13908
13909 SDValue MulLHS = LHS.getOperand(0);
13910 SDValue MulRHS = LHS.getOperand(1);
13911 SDValue AddRHS = RHS;
13912
13913 // Always check whether operands are small unsigned values, since that
13914 // knowledge is useful in more cases. Check for small signed values only if
13915 // doing so can unlock a shorter code sequence.
13916 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13917 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13918
13919 bool MulSignedLo = false;
13920 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13921 MulSignedLo =
13922 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13923 }
13924
13925 // The operands and final result all have the same number of bits. If
13926 // operands need to be extended, they can be extended with garbage. The
13927 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13928 // truncated away in the end.
13929 if (VT != MVT::i64) {
13930 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13931 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13932 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13933 }
13934
13935 // The basic code generated is conceptually straightforward. Pseudo code:
13936 //
13937 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13938 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13939 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13940 //
13941 // The second and third lines are optional, depending on whether the factors
13942 // are {sign,zero}-extended or not.
13943 //
13944 // The actual DAG is noisier than the pseudo code, but only due to
13945 // instructions that disassemble values into low and high parts, and
13946 // assemble the final result.
13947 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13948
13949 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13950 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13951 SDValue Accum =
13952 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13953
13954 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13955 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13956
13957 if (!MulLHSUnsigned32) {
13958 auto MulLHSHi =
13959 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13960 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13961 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13962 }
13963
13964 if (!MulRHSUnsigned32) {
13965 auto MulRHSHi =
13966 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13967 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13968 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13969 }
13970
13971 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13972 Accum = DAG.getBitcast(MVT::i64, Accum);
13973 }
13974
13975 if (VT != MVT::i64)
13976 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13977 return Accum;
13978}
13979
13980// Collect the ultimate src of each of the mul node's operands, and confirm
13981 // each operand provides no more than 8 bits (only its low byte is used).
13982static std::optional<ByteProvider<SDValue>>
13983handleMulOperand(const SDValue &MulOperand) {
13984 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13985 if (!Byte0 || Byte0->isConstantZero()) {
13986 return std::nullopt;
13987 }
13988 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13989 if (Byte1 && !Byte1->isConstantZero()) {
13990 return std::nullopt;
13991 }
13992 return Byte0;
13993}
13994
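// Merge two v_perm byte-select masks. A selector byte of 0x0c produces a
// constant zero byte, so each result byte takes the data selector from
// whichever mask provides one and stays 0x0c where both masks are zero
// selectors. The asserts check that the two masks never both carry a low
// (0-3) data selector for the same byte.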
13995static unsigned addPermMasks(unsigned First, unsigned Second) {
13996 unsigned FirstCs = First & 0x0c0c0c0c;
13997 unsigned SecondCs = Second & 0x0c0c0c0c;
13998 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13999 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14000
14001 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14002 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14003 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14004 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14005
14006 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14007}
14008
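// One 32-bit source of a dot-product operand: the node that provides the
// bytes, the v_perm byte-select mask used to extract them, and which dword of
// SrcOp they live in.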
14009 struct DotSrc {
14010 SDValue SrcOp;
14011 int64_t PermMask;
14012 int64_t DWordOffset;
14013 };
14014
14015 static void placeSources(ByteProvider<SDValue> &Src0,
14016 ByteProvider<SDValue> &Src1,
14017 SmallVectorImpl<DotSrc> &Src0s,
14018 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14019
14020 assert(Src0.Src.has_value() && Src1.Src.has_value());
14021 // Src0s and Src1s are empty, just place arbitrarily.
14022 if (Step == 0) {
14023 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14024 Src0.SrcOffset / 4});
14025 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14026 Src1.SrcOffset / 4});
14027 return;
14028 }
14029
14030 for (int BPI = 0; BPI < 2; BPI++) {
14031 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14032 if (BPI == 1) {
14033 BPP = {Src1, Src0};
14034 }
14035 unsigned ZeroMask = 0x0c0c0c0c;
14036 unsigned FMask = 0xFF << (8 * (3 - Step));
14037
14038 unsigned FirstMask =
14039 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14040 unsigned SecondMask =
14041 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
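// Each Step fills one selector byte of the mask (byte 3 - Step): its value is
// the byte's offset within the source dword, and every other selector byte
// stays 0x0c (constant zero).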
14042 // Attempt to find a Src vector which contains our SDValue; if found, add our
14043 // perm mask to the existing one. If we are unable to find a match for the
14044 // first SDValue, attempt to find a match for the second.
14045 int FirstGroup = -1;
14046 for (int I = 0; I < 2; I++) {
14047 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14048 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14049 return IterElt.SrcOp == *BPP.first.Src &&
14050 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14051 };
14052
14053 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14054 if (Match != Srcs.end()) {
14055 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14056 FirstGroup = I;
14057 break;
14058 }
14059 }
14060 if (FirstGroup != -1) {
14061 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14062 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14063 return IterElt.SrcOp == *BPP.second.Src &&
14064 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14065 };
14066 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14067 if (Match != Srcs.end()) {
14068 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14069 } else
14070 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14071 return;
14072 }
14073 }
14074
14075 // If we have made it here, then we could not find a match in Src0s or Src1s
14076 // for either Src0 or Src1, so just place them arbitrarily.
14077
14078 unsigned ZeroMask = 0x0c0c0c0c;
14079 unsigned FMask = 0xFF << (8 * (3 - Step));
14080
14081 Src0s.push_back(
14082 {*Src0.Src,
14083 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14084 Src0.SrcOffset / 4});
14085 Src1s.push_back(
14086 {*Src1.Src,
14087 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14088 Src1.SrcOffset / 4});
14089}
14090
14091 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14092 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14093 bool IsAny) {
14094
14095 // If we have only one source, just permute it accordingly.
14096 if (Srcs.size() == 1) {
14097 auto *Elt = Srcs.begin();
14098 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14099
14100 // v_perm will produce the original value
14101 if (Elt->PermMask == 0x3020100)
14102 return EltOp;
14103
14104 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14105 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14106 }
14107
14108 auto *FirstElt = Srcs.begin();
14109 auto *SecondElt = std::next(FirstElt);
14110
14111 SmallVector<SDValue, 2> Perms;
14112
14113 // If we have multiple sources in the chain, combine them via perms (using
14114 // calculated perm mask) and Ors.
14115 while (true) {
14116 auto FirstMask = FirstElt->PermMask;
14117 auto SecondMask = SecondElt->PermMask;
14118
14119 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14120 unsigned FirstPlusFour = FirstMask | 0x04040404;
14121 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14122 // original 0x0C.
14123 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
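// Net effect: data selectors in FirstMask are biased by 4 so that, in the
// combined PERM built below, they select bytes from FirstVal rather than
// SecondVal; 0x0c zero selectors are left untouched.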
14124
14125 auto PermMask = addPermMasks(FirstMask, SecondMask);
14126 auto FirstVal =
14127 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14128 auto SecondVal =
14129 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14130
14131 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14132 SecondVal,
14133 DAG.getConstant(PermMask, SL, MVT::i32)));
14134
14135 FirstElt = std::next(SecondElt);
14136 if (FirstElt == Srcs.end())
14137 break;
14138
14139 SecondElt = std::next(FirstElt);
14140 // If we only have a FirstElt, then just combine that into the cumulative
14141 // source node.
14142 if (SecondElt == Srcs.end()) {
14143 auto EltOp =
14144 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14145
14146 Perms.push_back(
14147 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14148 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14149 break;
14150 }
14151 }
14152
14153 assert(Perms.size() == 1 || Perms.size() == 2);
14154 return Perms.size() == 2
14155 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14156 : Perms[0];
14157}
14158
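// The per-source masks are built assuming a chain of four bytes. For shorter
// chains, shift each selector down so the populated bytes occupy the low end
// of the mask and force the vacated high bytes to the zero selector 0x0c.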
14159static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14160 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14161 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14162 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14163 EntryMask += ZeroMask;
14164 }
14165}
14166
14167static bool isMul(const SDValue Op) {
14168 auto Opcode = Op.getOpcode();
14169
14170 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14171 Opcode == AMDGPUISD::MUL_I24);
14172}
14173
14174static std::optional<bool>
14175 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14176 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14177 const SDValue &S1Op, const SelectionDAG &DAG) {
14178 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14179 // of the dot4 are irrelevant.
14180 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14181 return false;
14182
14183 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14184 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14185 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14186 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14187 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14188 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14189
14190 assert(!(S0IsUnsigned && S0IsSigned));
14191 assert(!(S1IsUnsigned && S1IsSigned));
14192
14193 // There are 9 possible permutations of
14194 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14195
14196 // In two permutations, the sign bits are known to be the same for both Ops,
14197 // so simply return Signed / Unsigned corresponding to the MSB
14198
14199 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14200 return S0IsSigned;
14201
14202 // In another two permutations, the sign bits are known to be opposite. In
14203 // this case return std::nullopt to indicate a bad match.
14204
14205 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14206 return std::nullopt;
14207
14208 // In the remaining five permutations, we don't know the value of the sign
14209 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14210 // the upper bits must be extension bits. Thus, the only ways for the sign
14211 // bit to be unknown are if it was sign extended from an unknown value, or
14212 // if it was any extended. In either case, it is correct to use the signed
14213 // version of the signedness semantics of dot4.
14214
14215 // In two of these permutations, we know the sign bit is set for
14216 // one op, and the other is unknown. It is okay to use the signed version of
14217 // dot4.
14218 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14219 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14220 return true;
14221
14222 // In one such permutation, we don't know either of the sign bits. It is okay
14223 // to use the signed version of dot4.
14224 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14225 return true;
14226
14227 // In two of these permutations, we know the sign bit is unset for
14228 // one op, and the other is unknown. Return std::nullopt to indicate a
14229 // bad match.
14230 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14231 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14232 return std::nullopt;
14233
14234 llvm_unreachable("Fully covered condition");
14235}
14236
14237SDValue SITargetLowering::performAddCombine(SDNode *N,
14238 DAGCombinerInfo &DCI) const {
14239 SelectionDAG &DAG = DCI.DAG;
14240 EVT VT = N->getValueType(0);
14241 SDLoc SL(N);
14242 SDValue LHS = N->getOperand(0);
14243 SDValue RHS = N->getOperand(1);
14244
14245 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14246 if (Subtarget->hasMad64_32()) {
14247 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14248 return Folded;
14249 }
14250 }
14251
14252 if (SDValue V = reassociateScalarOps(N, DAG)) {
14253 return V;
14254 }
14255
14256 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14257 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14258 SDValue TempNode(N, 0);
14259 std::optional<bool> IsSigned;
14260 SmallVector<DotSrc, 4> Src0s;
14261 SmallVector<DotSrc, 4> Src1s;
14262 SmallVector<SDValue, 4> Src2s;
14263
14264 // Match the v_dot4 tree, while collecting src nodes.
14265 int ChainLength = 0;
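// Walk a chain of up to four adds whose LHS or RHS is a byte-sized multiply.
// Each multiply contributes one byte pair to Src0s/Src1s; the leftover
// operand at the end of the chain becomes the 32-bit accumulator (the last
// entry of Src2s).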
14266 for (int I = 0; I < 4; I++) {
14267 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14268 if (MulIdx == -1)
14269 break;
14270 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14271 if (!Src0)
14272 break;
14273 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14274 if (!Src1)
14275 break;
14276
14277 auto IterIsSigned = checkDot4MulSignedness(
14278 TempNode->getOperand(MulIdx), *Src0, *Src1,
14279 TempNode->getOperand(MulIdx)->getOperand(0),
14280 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14281 if (!IterIsSigned)
14282 break;
14283 if (!IsSigned)
14284 IsSigned = *IterIsSigned;
14285 if (*IterIsSigned != *IsSigned)
14286 break;
14287 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14288 auto AddIdx = 1 - MulIdx;
14289 // Allow the special case where add (add (mul24, 0), mul24) was folded into
14290 // add (mul24, mul24).
14291 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14292 Src2s.push_back(TempNode->getOperand(AddIdx));
14293 auto Src0 =
14294 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14295 if (!Src0)
14296 break;
14297 auto Src1 =
14298 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14299 if (!Src1)
14300 break;
14301 auto IterIsSigned = checkDot4MulSignedness(
14302 TempNode->getOperand(AddIdx), *Src0, *Src1,
14303 TempNode->getOperand(AddIdx)->getOperand(0),
14304 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14305 if (!IterIsSigned)
14306 break;
14307 assert(IsSigned);
14308 if (*IterIsSigned != *IsSigned)
14309 break;
14310 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14311 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14312 ChainLength = I + 2;
14313 break;
14314 }
14315
14316 TempNode = TempNode->getOperand(AddIdx);
14317 Src2s.push_back(TempNode);
14318 ChainLength = I + 1;
14319 if (TempNode->getNumOperands() < 2)
14320 break;
14321 LHS = TempNode->getOperand(0);
14322 RHS = TempNode->getOperand(1);
14323 }
14324
14325 if (ChainLength < 2)
14326 return SDValue();
14327
14328 // Masks were constructed with assumption that we would find a chain of
14329 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14330 // 0x0c) so they do not affect dot calculation.
14331 if (ChainLength < 4) {
14332 fixMasks(Src0s, ChainLength);
14333 fixMasks(Src1s, ChainLength);
14334 }
14335
14336 SDValue Src0, Src1;
14337
14338 // If we are just using a single source for both, and have permuted the
14339 // bytes consistently, we can just use the sources without permuting
14340 // (commutation).
14341 bool UseOriginalSrc = false;
14342 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14343 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14344 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14345 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14346 SmallVector<unsigned, 4> SrcBytes;
14347 auto Src0Mask = Src0s.begin()->PermMask;
14348 SrcBytes.push_back(Src0Mask & 0xFF000000);
14349 bool UniqueEntries = true;
14350 for (auto I = 1; I < 4; I++) {
14351 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14352
14353 if (is_contained(SrcBytes, NextByte)) {
14354 UniqueEntries = false;
14355 break;
14356 }
14357 SrcBytes.push_back(NextByte);
14358 }
14359
14360 if (UniqueEntries) {
14361 UseOriginalSrc = true;
14362
14363 auto *FirstElt = Src0s.begin();
14364 auto FirstEltOp =
14365 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14366
14367 auto *SecondElt = Src1s.begin();
14368 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14369 SecondElt->DWordOffset);
14370
14371 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14372 MVT::getIntegerVT(32));
14373 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14374 MVT::getIntegerVT(32));
14375 }
14376 }
14377
14378 if (!UseOriginalSrc) {
14379 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14380 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14381 }
14382
14383 assert(IsSigned);
14384 SDValue Src2 =
14385 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14386
14387 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14388 : Intrinsic::amdgcn_udot4,
14389 SL, MVT::i64);
14390
14391 assert(!VT.isVector());
14392 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14393 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14394
14395 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14396 }
14397
14398 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14399 return SDValue();
14400
14401 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14402 // add x, sext (setcc) => usubo_carry x, 0, setcc
14403 unsigned Opc = LHS.getOpcode();
14404 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14405 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14406 std::swap(RHS, LHS);
14407
14408 Opc = RHS.getOpcode();
14409 switch (Opc) {
14410 default:
14411 break;
14412 case ISD::ZERO_EXTEND:
14413 case ISD::SIGN_EXTEND:
14414 case ISD::ANY_EXTEND: {
14415 auto Cond = RHS.getOperand(0);
14416 // If this won't be a real VOPC output, we would still need to insert an
14417 // extra instruction anyway.
14418 if (!isBoolSGPR(Cond))
14419 break;
14420 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14421 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14422 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14423 return DAG.getNode(Opc, SL, VTList, Args);
14424 }
14425 case ISD::UADDO_CARRY: {
14426 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14427 if (!isNullConstant(RHS.getOperand(1)))
14428 break;
14429 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14430 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14431 }
14432 }
14433 return SDValue();
14434}
14435
14436SDValue SITargetLowering::performSubCombine(SDNode *N,
14437 DAGCombinerInfo &DCI) const {
14438 SelectionDAG &DAG = DCI.DAG;
14439 EVT VT = N->getValueType(0);
14440
14441 if (VT != MVT::i32)
14442 return SDValue();
14443
14444 SDLoc SL(N);
14445 SDValue LHS = N->getOperand(0);
14446 SDValue RHS = N->getOperand(1);
14447
14448 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14449 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14450 unsigned Opc = RHS.getOpcode();
14451 switch (Opc) {
14452 default:
14453 break;
14454 case ISD::ZERO_EXTEND:
14455 case ISD::SIGN_EXTEND:
14456 case ISD::ANY_EXTEND: {
14457 auto Cond = RHS.getOperand(0);
14458 // If this won't be a real VOPC output, we would still need to insert an
14459 // extra instruction anyway.
14460 if (!isBoolSGPR(Cond))
14461 break;
14462 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14463 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14464 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14465 return DAG.getNode(Opc, SL, VTList, Args);
14466 }
14467 }
14468
14469 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14470 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14471 if (!isNullConstant(LHS.getOperand(1)))
14472 return SDValue();
14473 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14474 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14475 }
14476 return SDValue();
14477}
14478
14479SDValue
14480SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14481 DAGCombinerInfo &DCI) const {
14482
14483 if (N->getValueType(0) != MVT::i32)
14484 return SDValue();
14485
14486 if (!isNullConstant(N->getOperand(1)))
14487 return SDValue();
14488
14489 SelectionDAG &DAG = DCI.DAG;
14490 SDValue LHS = N->getOperand(0);
14491
14492 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14493 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14494 unsigned LHSOpc = LHS.getOpcode();
14495 unsigned Opc = N->getOpcode();
14496 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14497 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14498 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14499 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14500 }
14501 return SDValue();
14502}
14503
14504SDValue SITargetLowering::performFAddCombine(SDNode *N,
14505 DAGCombinerInfo &DCI) const {
14506 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14507 return SDValue();
14508
14509 SelectionDAG &DAG = DCI.DAG;
14510 EVT VT = N->getValueType(0);
14511
14512 SDLoc SL(N);
14513 SDValue LHS = N->getOperand(0);
14514 SDValue RHS = N->getOperand(1);
14515
14516 // These should really be instruction patterns, but writing patterns with
14517 // source modifiers is a pain.
14518
14519 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14520 if (LHS.getOpcode() == ISD::FADD) {
14521 SDValue A = LHS.getOperand(0);
14522 if (A == LHS.getOperand(1)) {
14523 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14524 if (FusedOp != 0) {
14525 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14526 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14527 }
14528 }
14529 }
14530
14531 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14532 if (RHS.getOpcode() == ISD::FADD) {
14533 SDValue A = RHS.getOperand(0);
14534 if (A == RHS.getOperand(1)) {
14535 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14536 if (FusedOp != 0) {
14537 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14538 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14539 }
14540 }
14541 }
14542
14543 return SDValue();
14544}
14545
14546SDValue SITargetLowering::performFSubCombine(SDNode *N,
14547 DAGCombinerInfo &DCI) const {
14548 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14549 return SDValue();
14550
14551 SelectionDAG &DAG = DCI.DAG;
14552 SDLoc SL(N);
14553 EVT VT = N->getValueType(0);
14554 assert(!VT.isVector());
14555
14556 // Try to get the fneg to fold into the source modifier. This undoes generic
14557 // DAG combines and folds them into the mad.
14558 //
14559 // Only do this if we are not trying to support denormals. v_mad_f32 does
14560 // not support denormals ever.
14561 SDValue LHS = N->getOperand(0);
14562 SDValue RHS = N->getOperand(1);
14563 if (LHS.getOpcode() == ISD::FADD) {
14564 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14565 SDValue A = LHS.getOperand(0);
14566 if (A == LHS.getOperand(1)) {
14567 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14568 if (FusedOp != 0) {
14569 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14570 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14571
14572 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14573 }
14574 }
14575 }
14576
14577 if (RHS.getOpcode() == ISD::FADD) {
14578 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14579
14580 SDValue A = RHS.getOperand(0);
14581 if (A == RHS.getOperand(1)) {
14582 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14583 if (FusedOp != 0) {
14584 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14585 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14586 }
14587 }
14588 }
14589
14590 return SDValue();
14591}
14592
14593SDValue SITargetLowering::performFDivCombine(SDNode *N,
14594 DAGCombinerInfo &DCI) const {
14595 SelectionDAG &DAG = DCI.DAG;
14596 SDLoc SL(N);
14597 EVT VT = N->getValueType(0);
14598 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14599 return SDValue();
14600
14601 SDValue LHS = N->getOperand(0);
14602 SDValue RHS = N->getOperand(1);
14603
14604 SDNodeFlags Flags = N->getFlags();
14605 SDNodeFlags RHSFlags = RHS->getFlags();
14606 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14607 !RHS->hasOneUse())
14608 return SDValue();
14609
14610 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14611 bool IsNegative = false;
14612 if (CLHS->isExactlyValue(1.0) ||
14613 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14614 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14615 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14616 if (RHS.getOpcode() == ISD::FSQRT) {
14617 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14618 SDValue Rsq =
14619 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14620 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14621 }
14622 }
14623 }
14624
14625 return SDValue();
14626}
14627
14628SDValue SITargetLowering::performFMulCombine(SDNode *N,
14629 DAGCombinerInfo &DCI) const {
14630 SelectionDAG &DAG = DCI.DAG;
14631 EVT VT = N->getValueType(0);
14632 EVT ScalarVT = VT.getScalarType();
14633 EVT IntVT = VT.changeElementType(MVT::i32);
14634
14635 SDValue LHS = N->getOperand(0);
14636 SDValue RHS = N->getOperand(1);
14637
14638 // It is cheaper to realize i32 inline constants than to materialize
14639 // f16 or f64 (or even non-inline f32) values; this is possible via
14640 // ldexp usage, as shown below:
14641 //
14642 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14643 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14644 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
14645 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14646 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14647 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14648 if (!TrueNode)
14649 return SDValue();
14650 const ConstantFPSDNode *FalseNode =
14651 isConstOrConstSplatFP(RHS.getOperand(2));
14652 if (!FalseNode)
14653 return SDValue();
14654
14655 if (TrueNode->isNegative() != FalseNode->isNegative())
14656 return SDValue();
14657
14658 // For f32, only non-inline constants should be transformed.
14659 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14660 if (ScalarVT == MVT::f32 &&
14661 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14662 TII->isInlineConstant(FalseNode->getValueAPF()))
14663 return SDValue();
14664
14665 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14666 if (TrueNodeExpVal == INT_MIN)
14667 return SDValue();
14668 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14669 if (FalseNodeExpVal == INT_MIN)
14670 return SDValue();
14671
14672 SDLoc SL(N);
14673 SDValue SelectNode =
14674 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14675 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14676 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14677
14678 LHS = TrueNode->isNegative()
14679 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14680 : LHS;
14681
14682 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14683 }
14684
14685 return SDValue();
14686}
14687
14688SDValue SITargetLowering::performFMACombine(SDNode *N,
14689 DAGCombinerInfo &DCI) const {
14690 SelectionDAG &DAG = DCI.DAG;
14691 EVT VT = N->getValueType(0);
14692 SDLoc SL(N);
14693
14694 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14695 return SDValue();
14696
14697 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14698 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14699 SDValue Op1 = N->getOperand(0);
14700 SDValue Op2 = N->getOperand(1);
14701 SDValue FMA = N->getOperand(2);
14702
14703 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14704 Op2.getOpcode() != ISD::FP_EXTEND)
14705 return SDValue();
14706
14707 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14708 // regardless of the denorm mode setting. Therefore,
14709 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14710 const TargetOptions &Options = DAG.getTarget().Options;
14711 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14712 (N->getFlags().hasAllowContract() &&
14713 FMA->getFlags().hasAllowContract())) {
14714 Op1 = Op1.getOperand(0);
14715 Op2 = Op2.getOperand(0);
14716 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14717 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14718 return SDValue();
14719
14720 SDValue Vec1 = Op1.getOperand(0);
14721 SDValue Idx1 = Op1.getOperand(1);
14722 SDValue Vec2 = Op2.getOperand(0);
14723
14724 SDValue FMAOp1 = FMA.getOperand(0);
14725 SDValue FMAOp2 = FMA.getOperand(1);
14726 SDValue FMAAcc = FMA.getOperand(2);
14727
14728 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14729 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14730 return SDValue();
14731
14732 FMAOp1 = FMAOp1.getOperand(0);
14733 FMAOp2 = FMAOp2.getOperand(0);
14734 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14735 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14736 return SDValue();
14737
14738 SDValue Vec3 = FMAOp1.getOperand(0);
14739 SDValue Vec4 = FMAOp2.getOperand(0);
14740 SDValue Idx2 = FMAOp1.getOperand(1);
14741
14742 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14743 // Idx1 and Idx2 cannot be the same.
14744 Idx1 == Idx2)
14745 return SDValue();
14746
14747 if (Vec1 == Vec2 || Vec3 == Vec4)
14748 return SDValue();
14749
14750 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14751 return SDValue();
14752
14753 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14754 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14755 DAG.getTargetConstant(0, SL, MVT::i1));
14756 }
14757 }
14758 return SDValue();
14759}
14760
14761SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14762 DAGCombinerInfo &DCI) const {
14763 SelectionDAG &DAG = DCI.DAG;
14764 SDLoc SL(N);
14765
14766 SDValue LHS = N->getOperand(0);
14767 SDValue RHS = N->getOperand(1);
14768 EVT VT = LHS.getValueType();
14769 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14770
14771 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14772 if (!CRHS) {
14773 CRHS = dyn_cast<ConstantSDNode>(LHS);
14774 if (CRHS) {
14775 std::swap(LHS, RHS);
14776 CC = getSetCCSwappedOperands(CC);
14777 }
14778 }
14779
14780 if (CRHS) {
14781 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14782 isBoolSGPR(LHS.getOperand(0))) {
14783 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14784 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14785 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14786 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14787 if ((CRHS->isAllOnes() &&
14788 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14789 (CRHS->isZero() &&
14790 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14791 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14792 DAG.getAllOnesConstant(SL, MVT::i1));
14793 if ((CRHS->isAllOnes() &&
14794 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14795 (CRHS->isZero() &&
14796 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14797 return LHS.getOperand(0);
14798 }
14799
14800 const APInt &CRHSVal = CRHS->getAPIntValue();
14801 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14802 LHS.getOpcode() == ISD::SELECT &&
14803 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14804 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14805 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14806 isBoolSGPR(LHS.getOperand(0))) {
14807 // Given CT != FT:
14808 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14809 // setcc (select cc, CT, CF), CF, ne => cc
14810 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14811 // setcc (select cc, CT, CF), CT, eq => cc
14812 const APInt &CT = LHS.getConstantOperandAPInt(1);
14813 const APInt &CF = LHS.getConstantOperandAPInt(2);
14814
14815 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14816 (CT == CRHSVal && CC == ISD::SETNE))
14817 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14818 DAG.getAllOnesConstant(SL, MVT::i1));
14819 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14820 (CT == CRHSVal && CC == ISD::SETEQ))
14821 return LHS.getOperand(0);
14822 }
14823 }
14824
14825 if (VT != MVT::f32 && VT != MVT::f64 &&
14826 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14827 return SDValue();
14828
14829 // Match isinf/isfinite pattern
14830 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14831 // (fcmp one (fabs x), inf) -> (fp_class x,
14832 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14833 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14834 LHS.getOpcode() == ISD::FABS) {
14835 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14836 if (!CRHS)
14837 return SDValue();
14838
14839 const APFloat &APF = CRHS->getValueAPF();
14840 if (APF.isInfinity() && !APF.isNegative()) {
14841 const unsigned IsInfMask =
14842 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14843 const unsigned IsFiniteMask =
14844 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14845 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14846 SIInstrFlags::P_SUBNORMAL;
14847 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14848 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14849 DAG.getConstant(Mask, SL, MVT::i32));
14850 }
14851 }
14852
14853 return SDValue();
14854}
14855
14856SDValue
14857SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14858 DAGCombinerInfo &DCI) const {
14859 SelectionDAG &DAG = DCI.DAG;
14860 SDLoc SL(N);
14861 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
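// Offset is the byte index (0-3) of the i32 source selected by this
// CVT_F32_UBYTEn node.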
14862
14863 SDValue Src = N->getOperand(0);
14864 SDValue Shift = N->getOperand(0);
14865
14866 // TODO: Extend type shouldn't matter (assuming legal types).
14867 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14868 Shift = Shift.getOperand(0);
14869
14870 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14871 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14872 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14873 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14874 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14875 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14876 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14877 SDValue Shifted = DAG.getZExtOrTrunc(
14878 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14879
14880 unsigned ShiftOffset = 8 * Offset;
14881 if (Shift.getOpcode() == ISD::SHL)
14882 ShiftOffset -= C->getZExtValue();
14883 else
14884 ShiftOffset += C->getZExtValue();
14885
14886 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14887 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14888 MVT::f32, Shifted);
14889 }
14890 }
14891 }
14892
14893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14894 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14895 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14896 // We simplified Src. If this node is not dead, visit it again so it is
14897 // folded properly.
14898 if (N->getOpcode() != ISD::DELETED_NODE)
14899 DCI.AddToWorklist(N);
14900 return SDValue(N, 0);
14901 }
14902
14903 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14904 if (SDValue DemandedSrc =
14905 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14906 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14907
14908 return SDValue();
14909}
14910
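// Constant-fold AMDGPUISD::CLAMP of a constant source: negative values (and
// NaN when DX10 clamp is enabled) fold to 0.0, values greater than 1.0 fold
// to 1.0, and anything else folds to the constant itself.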
14911SDValue SITargetLowering::performClampCombine(SDNode *N,
14912 DAGCombinerInfo &DCI) const {
14913 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14914 if (!CSrc)
14915 return SDValue();
14916
14917 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14918 const APFloat &F = CSrc->getValueAPF();
14919 APFloat Zero = APFloat::getZero(F.getSemantics());
14920 if (F < Zero ||
14921 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14922 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14923 }
14924
14925 APFloat One(F.getSemantics(), "1.0");
14926 if (F > One)
14927 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14928
14929 return SDValue(CSrc, 0);
14930}
14931
14932 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14933 DAGCombinerInfo &DCI) const {
14934 switch (N->getOpcode()) {
14935 case ISD::ADD:
14936 case ISD::SUB:
14937 case ISD::SHL:
14938 case ISD::SRL:
14939 case ISD::SRA:
14940 case ISD::AND:
14941 case ISD::OR:
14942 case ISD::XOR:
14943 case ISD::MUL:
14944 case ISD::SETCC:
14945 case ISD::SELECT:
14946 case ISD::SMIN:
14947 case ISD::SMAX:
14948 case ISD::UMIN:
14949 case ISD::UMAX:
14950 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14951 return Res;
14952 break;
14953 default:
14954 break;
14955 }
14956
14957 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14958 return SDValue();
14959
14960 switch (N->getOpcode()) {
14961 case ISD::ADD:
14962 return performAddCombine(N, DCI);
14963 case ISD::SUB:
14964 return performSubCombine(N, DCI);
14965 case ISD::UADDO_CARRY:
14966 case ISD::USUBO_CARRY:
14967 return performAddCarrySubCarryCombine(N, DCI);
14968 case ISD::FADD:
14969 return performFAddCombine(N, DCI);
14970 case ISD::FSUB:
14971 return performFSubCombine(N, DCI);
14972 case ISD::FDIV:
14973 return performFDivCombine(N, DCI);
14974 case ISD::FMUL:
14975 return performFMulCombine(N, DCI);
14976 case ISD::SETCC:
14977 return performSetCCCombine(N, DCI);
14978 case ISD::FMAXNUM:
14979 case ISD::FMINNUM:
14980 case ISD::FMAXNUM_IEEE:
14981 case ISD::FMINNUM_IEEE:
14982 case ISD::FMAXIMUM:
14983 case ISD::FMINIMUM:
14984 case ISD::SMAX:
14985 case ISD::SMIN:
14986 case ISD::UMAX:
14987 case ISD::UMIN:
14988 case AMDGPUISD::FMIN_LEGACY:
14989 case AMDGPUISD::FMAX_LEGACY:
14990 return performMinMaxCombine(N, DCI);
14991 case ISD::FMA:
14992 return performFMACombine(N, DCI);
14993 case ISD::AND:
14994 return performAndCombine(N, DCI);
14995 case ISD::OR:
14996 return performOrCombine(N, DCI);
14997 case ISD::FSHR: {
14998 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14999 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15000 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15001 return matchPERM(N, DCI);
15002 }
15003 break;
15004 }
15005 case ISD::XOR:
15006 return performXorCombine(N, DCI);
15007 case ISD::ZERO_EXTEND:
15008 return performZeroExtendCombine(N, DCI);
15009 case ISD::SIGN_EXTEND_INREG:
15010 return performSignExtendInRegCombine(N, DCI);
15011 case AMDGPUISD::FP_CLASS:
15012 return performClassCombine(N, DCI);
15013 case ISD::FCANONICALIZE:
15014 return performFCanonicalizeCombine(N, DCI);
15015 case AMDGPUISD::RCP:
15016 return performRcpCombine(N, DCI);
15017 case ISD::FLDEXP:
15018 case AMDGPUISD::FRACT:
15019 case AMDGPUISD::RSQ:
15020 case AMDGPUISD::RCP_LEGACY:
15021 case AMDGPUISD::RCP_IFLAG:
15022 case AMDGPUISD::RSQ_CLAMP: {
15023 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15024 SDValue Src = N->getOperand(0);
15025 if (Src.isUndef())
15026 return Src;
15027 break;
15028 }
15029 case ISD::SINT_TO_FP:
15030 case ISD::UINT_TO_FP:
15031 return performUCharToFloatCombine(N, DCI);
15032 case ISD::FCOPYSIGN:
15033 return performFCopySignCombine(N, DCI);
15034 case AMDGPUISD::CVT_F32_UBYTE0:
15035 case AMDGPUISD::CVT_F32_UBYTE1:
15036 case AMDGPUISD::CVT_F32_UBYTE2:
15037 case AMDGPUISD::CVT_F32_UBYTE3:
15038 return performCvtF32UByteNCombine(N, DCI);
15039 case AMDGPUISD::FMED3:
15040 return performFMed3Combine(N, DCI);
15041 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15042 return performCvtPkRTZCombine(N, DCI);
15043 case AMDGPUISD::CLAMP:
15044 return performClampCombine(N, DCI);
15045 case ISD::SCALAR_TO_VECTOR: {
15046 SelectionDAG &DAG = DCI.DAG;
15047 EVT VT = N->getValueType(0);
15048
15049 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15050 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15051 SDLoc SL(N);
15052 SDValue Src = N->getOperand(0);
15053 EVT EltVT = Src.getValueType();
15054 if (EltVT != MVT::i16)
15055 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15056
15057 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15058 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15059 }
15060
15061 break;
15062 }
15063 case ISD::EXTRACT_VECTOR_ELT:
15064 return performExtractVectorEltCombine(N, DCI);
15065 case ISD::INSERT_VECTOR_ELT:
15066 return performInsertVectorEltCombine(N, DCI);
15067 case ISD::FP_ROUND:
15068 return performFPRoundCombine(N, DCI);
15069 case ISD::LOAD: {
15070 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15071 return Widened;
15072 [[fallthrough]];
15073 }
15074 default: {
15075 if (!DCI.isBeforeLegalize()) {
15076 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15077 return performMemSDNodeCombine(MemNode, DCI);
15078 }
15079
15080 break;
15081 }
15082 }
15083
15084 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15085}
15086
15087/// Helper function for adjustWritemask
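/// Map an EXTRACT_SUBREG index (sub0..sub4) to its result lane number; sub4
/// can only appear when TFE/LWE adds an extra result register.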
15088static unsigned SubIdx2Lane(unsigned Idx) {
15089 switch (Idx) {
15090 default:
15091 return ~0u;
15092 case AMDGPU::sub0:
15093 return 0;
15094 case AMDGPU::sub1:
15095 return 1;
15096 case AMDGPU::sub2:
15097 return 2;
15098 case AMDGPU::sub3:
15099 return 3;
15100 case AMDGPU::sub4:
15101 return 4; // Possible with TFE/LWE
15102 }
15103}
15104
15105/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15106SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15107 SelectionDAG &DAG) const {
15108 unsigned Opcode = Node->getMachineOpcode();
15109
15110 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15111 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15112 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15113 return Node; // not implemented for D16
15114
15115 SDNode *Users[5] = {nullptr};
15116 unsigned Lane = 0;
15117 unsigned DmaskIdx =
15118 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15119 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15120 unsigned NewDmask = 0;
15121 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15122 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15123 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15124 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
15127 unsigned TFCLane = 0;
15128 bool HasChain = Node->getNumValues() > 1;
15129
15130 if (OldDmask == 0) {
15131 // These are folded out, but on the chance it happens don't assert.
15132 return Node;
15133 }
15134
15135 unsigned OldBitsSet = llvm::popcount(OldDmask);
15136 // Work out which is the TFE/LWE lane if that is enabled.
15137 if (UsesTFC) {
15138 TFCLane = OldBitsSet;
15139 }
15140
15141 // Try to figure out the used register components
15142 for (SDUse &Use : Node->uses()) {
15143
15144 // Don't look at users of the chain.
15145 if (Use.getResNo() != 0)
15146 continue;
15147
15148 SDNode *User = Use.getUser();
15149
15150 // Abort if we can't understand the usage
15151 if (!User->isMachineOpcode() ||
15152 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15153 return Node;
15154
15155 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15156 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15157 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15158 // set, etc.
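// For example, with OldDmask = 0b1010 only the Y and W components are read;
// Lane 0 then corresponds to Y (Comp = 1) and Lane 1 to W (Comp = 3).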
15159 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15160 if (Lane == ~0u)
15161 return Node;
15162
15163 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15164 if (UsesTFC && Lane == TFCLane) {
15165 Users[Lane] = User;
15166 } else {
15167 // Set which texture component corresponds to the lane.
15168 unsigned Comp;
15169 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15170 Comp = llvm::countr_zero(Dmask);
15171 Dmask &= ~(1 << Comp);
15172 }
15173
15174 // Abort if we have more than one user per component.
15175 if (Users[Lane])
15176 return Node;
15177
15178 Users[Lane] = User;
15179 NewDmask |= 1 << Comp;
15180 }
15181 }
15182
15183 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15184 bool NoChannels = !NewDmask;
15185 if (NoChannels) {
15186 if (!UsesTFC) {
15187 // No uses of the result and not using TFC. Then do nothing.
15188 return Node;
15189 }
15190 // If the original dmask has one channel - then nothing to do
15191 if (OldBitsSet == 1)
15192 return Node;
15193 // Use an arbitrary dmask - required for the instruction to work
15194 NewDmask = 1;
15195 }
15196 // Abort if there's no change
15197 if (NewDmask == OldDmask)
15198 return Node;
15199
15200 unsigned BitsSet = llvm::popcount(NewDmask);
15201
15202 // Check for TFE or LWE - increase the number of channels by one to account
15203 // for the extra return value
15204 // This will need adjustment for D16 if this is also included in
15205 // adjustWriteMask (this function) but at present D16 are excluded.
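// For example, NewDmask = 0b0101 gives BitsSet = 2; with TFE or LWE enabled
// NewChannels becomes 3, which the result-type selection below rounds up to
// a 4-element vector.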
15206 unsigned NewChannels = BitsSet + UsesTFC;
15207
15208 int NewOpcode =
15209 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15210 assert(NewOpcode != -1 &&
15211 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15212 "failed to find equivalent MIMG op");
15213
15214 // Adjust the writemask in the node
15215 SmallVector<SDValue, 12> Ops;
15216 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15217 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15218 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15219
15220 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15221
15222 MVT ResultVT = NewChannels == 1
15223 ? SVT
15224 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15225 : NewChannels == 5 ? 8
15226 : NewChannels);
15227 SDVTList NewVTList =
15228 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15229
15230 MachineSDNode *NewNode =
15231 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15232
15233 if (HasChain) {
15234 // Update chain.
15235 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15236 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15237 }
15238
15239 if (NewChannels == 1) {
15240 assert(Node->hasNUsesOfValue(1, 0));
15241 SDNode *Copy =
15242 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15243 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15244 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15245 return nullptr;
15246 }
15247
15248 // Update the users of the node with the new indices
15249 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15250 SDNode *User = Users[i];
15251 if (!User) {
15252 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15253 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15254 if (i || !NoChannels)
15255 continue;
15256 } else {
15257 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15258 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15259 if (NewUser != User) {
15260 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15261 DAG.RemoveDeadNode(User);
15262 }
15263 }
15264
15265 switch (Idx) {
15266 default:
15267 break;
15268 case AMDGPU::sub0:
15269 Idx = AMDGPU::sub1;
15270 break;
15271 case AMDGPU::sub1:
15272 Idx = AMDGPU::sub2;
15273 break;
15274 case AMDGPU::sub2:
15275 Idx = AMDGPU::sub3;
15276 break;
15277 case AMDGPU::sub3:
15278 Idx = AMDGPU::sub4;
15279 break;
15280 }
15281 }
15282
15283 DAG.RemoveDeadNode(Node);
15284 return nullptr;
15285}
15286
15287 static bool isFrameIndexOp(SDValue Op) {
15288 if (Op.getOpcode() == ISD::AssertZext)
15289 Op = Op.getOperand(0);
15290
15291 return isa<FrameIndexSDNode>(Op);
15292}
15293
15294/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15295/// with frame index operands.
15296 /// LLVM assumes that inputs to these instructions are registers.
15297SDNode *
15298 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15299 SelectionDAG &DAG) const {
15300 if (Node->getOpcode() == ISD::CopyToReg) {
15301 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15302 SDValue SrcVal = Node->getOperand(2);
15303
15304 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15305 // to try understanding copies to physical registers.
15306 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15307 SDLoc SL(Node);
15308 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15309 SDValue VReg = DAG.getRegister(
15310 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15311
15312 SDNode *Glued = Node->getGluedNode();
15313 SDValue ToVReg = DAG.getCopyToReg(
15314 Node->getOperand(0), SL, VReg, SrcVal,
15315 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15316 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15317 VReg, ToVReg.getValue(1));
15318 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15319 DAG.RemoveDeadNode(Node);
15320 return ToResultReg.getNode();
15321 }
15322 }
15323
15324 SmallVector<SDValue, 8> Ops;
15325 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15326 if (!isFrameIndexOp(Node->getOperand(i))) {
15327 Ops.push_back(Node->getOperand(i));
15328 continue;
15329 }
15330
15331 SDLoc DL(Node);
15332 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15333 Node->getOperand(i).getValueType(),
15334 Node->getOperand(i)),
15335 0));
15336 }
15337
15338 return DAG.UpdateNodeOperands(Node, Ops);
15339}
15340
15341/// Fold the instructions after selecting them.
15342/// Returns null if users were already updated.
15343 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15344 SelectionDAG &DAG) const {
15345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15346 unsigned Opcode = Node->getMachineOpcode();
15347
15348 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15349 !TII->isGather4(Opcode) &&
15350 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15351 return adjustWritemask(Node, DAG);
15352 }
15353
15354 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15355 legalizeTargetIndependentNode(Node, DAG);
15356 return Node;
15357 }
15358
15359 switch (Opcode) {
15360 case AMDGPU::V_DIV_SCALE_F32_e64:
15361 case AMDGPU::V_DIV_SCALE_F64_e64: {
15362 // Satisfy the operand register constraint when one of the inputs is
15363 // undefined. Ordinarily each undef value will have its own implicit_def of
15364 // a vreg, so force these to use a single register.
15365 SDValue Src0 = Node->getOperand(1);
15366 SDValue Src1 = Node->getOperand(3);
15367 SDValue Src2 = Node->getOperand(5);
15368
15369 if ((Src0.isMachineOpcode() &&
15370 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15371 (Src0 == Src1 || Src0 == Src2))
15372 break;
15373
15374 MVT VT = Src0.getValueType().getSimpleVT();
15375 const TargetRegisterClass *RC =
15376 getRegClassFor(VT, Src0.getNode()->isDivergent());
15377
15378 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15379 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15380
15381 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15382 Src0, SDValue());
15383
15384 // src0 must be the same register as src1 or src2, even if the value is
15385 // undefined, so make sure we don't violate this constraint.
15386 if (Src0.isMachineOpcode() &&
15387 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15388 if (Src1.isMachineOpcode() &&
15389 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15390 Src0 = Src1;
15391 else if (Src2.isMachineOpcode() &&
15392 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15393 Src0 = Src2;
15394 else {
15395 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15396 Src0 = UndefReg;
15397 Src1 = UndefReg;
15398 }
15399 } else
15400 break;
15401
15402 SmallVector<SDValue, 9> Ops(Node->ops());
15403 Ops[1] = Src0;
15404 Ops[3] = Src1;
15405 Ops[5] = Src2;
15406 Ops.push_back(ImpDef.getValue(1));
15407 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15408 }
15409 default:
15410 break;
15411 }
15412
15413 return Node;
15414}
15415
15416// Any MIMG instructions that use tfe or lwe require an initialization of the
15417// result register that will be written in the case of a memory access failure.
15418// The required code is also added to tie this init code to the result of the
15419// img instruction.
15420 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15421 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15422 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15423 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15424 MachineBasicBlock &MBB = *MI.getParent();
15425
15426 int DstIdx =
15427 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15428 unsigned InitIdx = 0;
15429
15430 if (TII->isImage(MI)) {
15431 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15432 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15433 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15434
15435 if (!TFE && !LWE) // intersect_ray
15436 return;
15437
15438 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15439 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15440 unsigned D16Val = D16 ? D16->getImm() : 0;
15441
15442 if (!TFEVal && !LWEVal)
15443 return;
15444
15445 // At least one of TFE or LWE is non-zero.
15446 // We have to insert a suitable initialization of the result value and
15447 // tie this to the dest of the image instruction.
15448
15449 // Calculate which dword we have to initialize to 0.
15450 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15451
15452 // check that dmask operand is found.
15453 assert(MO_Dmask && "Expected dmask operand in instruction");
15454
15455 unsigned dmask = MO_Dmask->getImm();
15456 // Determine the number of active lanes taking into account the
15457 // Gather4 special case
15458 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15459
15460 bool Packed = !Subtarget->hasUnpackedD16VMem();
15461
15462 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15463
15464 // Abandon the attempt if the dst size isn't large enough
15465 // - this is in fact an error, but it is picked up elsewhere and
15466 // reported correctly.
15467 uint32_t DstSize =
15468 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15469 if (DstSize < InitIdx)
15470 return;
15471 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15472 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15473 } else {
15474 return;
15475 }
15476
15477 const DebugLoc &DL = MI.getDebugLoc();
15478
15479 // Create a register for the initialization value.
15480 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15481 unsigned NewDst = 0; // Final initialized value will be in here
15482
15483 // If PRTStrictNull feature is enabled (the default) then initialize
15484 // all the result registers to 0, otherwise just the error indication
15485 // register (VGPRn+1)
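// For example, with InitIdx = 4 and PRTStrictNull enabled all four dwords are
// zeroed; with the feature disabled only the last dword (the TFE/LWE error
// slot) is initialized.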
15486 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15487 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15488
15489 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15490 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15491 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15492 // Initialize dword
15493 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15494 // clang-format off
15495 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15496 .addImm(0);
15497 // clang-format on
15498 // Insert into the super-reg
15499 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15500 .addReg(PrevDst)
15501 .addReg(SubReg)
15502 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15503
15504 PrevDst = NewDst;
15505 }
15506
15507 // Add as an implicit operand
15508 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15509
15510 // Tie the just added implicit operand to the dst
15511 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15512}
15513
15514/// Assign the register class depending on the number of
15515/// bits set in the writemask
15516 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15517 SDNode *Node) const {
15518 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15519
15520 MachineFunction *MF = MI.getParent()->getParent();
15521 MachineRegisterInfo &MRI = MF->getRegInfo();
15522 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15523
15524 if (TII->isVOP3(MI.getOpcode())) {
15525 // Make sure constant bus requirements are respected.
15526 TII->legalizeOperandsVOP3(MRI, MI);
15527
15528 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15529 // This saves a chain-copy of registers and better balance register
15530 // use between vgpr and agpr as agpr tuples tend to be big.
15531 if (!MI.getDesc().operands().empty()) {
15532 unsigned Opc = MI.getOpcode();
15533 bool HasAGPRs = Info->mayNeedAGPRs();
15534 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15535 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15536 for (auto I :
15537 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15538 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15539 if (I == -1)
15540 break;
15541 if ((I == Src2Idx) && (HasAGPRs))
15542 break;
15543 MachineOperand &Op = MI.getOperand(I);
15544 if (!Op.isReg() || !Op.getReg().isVirtual())
15545 continue;
15546 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15547 if (!TRI->hasAGPRs(RC))
15548 continue;
15549 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15550 if (!Src || !Src->isCopy() ||
15551 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15552 continue;
15553 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15554 // All uses of agpr64 and agpr32 can also accept vgpr except for
15555 // v_accvgpr_read, but we do not produce agpr reads during selection,
15556 // so no use checks are needed.
15557 MRI.setRegClass(Op.getReg(), NewRC);
15558 }
15559
15560 if (TII->isMAI(MI)) {
15561 // The ordinary src0, src1, src2 were legalized above.
15562 //
15563 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15564 // as a separate instruction.
15565 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15566 AMDGPU::OpName::scale_src0);
15567 if (Src0Idx != -1) {
15568 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15569 AMDGPU::OpName::scale_src1);
15570 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15571 TII->usesConstantBus(MRI, MI, Src1Idx))
15572 TII->legalizeOpWithMove(MI, Src1Idx);
15573 }
15574 }
15575
15576 if (!HasAGPRs)
15577 return;
15578
15579 // Resolve the rest of AV operands to AGPRs.
15580 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15581 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15582 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15583 if (TRI->isVectorSuperClass(RC)) {
15584 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15585 MRI.setRegClass(Src2->getReg(), NewRC);
15586 if (Src2->isTied())
15587 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15588 }
15589 }
15590 }
15591 }
15592
15593 return;
15594 }
15595
15596 if (TII->isImage(MI))
15597 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15598}
15599
15600 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15601 uint64_t Val) {
15602 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15603 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15604}
15605
15606 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15607 const SDLoc &DL,
15608 SDValue Ptr) const {
15609 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15610
15611 // Build the half of the subregister with the constants before building the
15612 // full 128-bit register. If we are building multiple resource descriptors,
15613 // this will allow CSEing of the 2-component register.
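// SubRegHi therefore holds dwords 2..3 of the descriptor: dword2 is 0 and
// dword3 is the upper half of the default resource data format.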
15614 const SDValue Ops0[] = {
15615 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15616 buildSMovImm32(DAG, DL, 0),
15617 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15618 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15619 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15620
15621 SDValue SubRegHi = SDValue(
15622 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15623
15624 // Combine the constants and the pointer.
15625 const SDValue Ops1[] = {
15626 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15627 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15628 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15629
15630 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15631}
15632
15633/// Return a resource descriptor with the 'Add TID' bit enabled
15634/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15635/// of the resource descriptor) to create an offset, which is added to
15636/// the resource pointer.
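/// The words of the returned descriptor are: dword0 = the low half of \p Ptr,
/// dword1 = the high half of \p Ptr OR'd with \p RsrcDword1, and
/// dword2/dword3 = the low/high halves of \p RsrcDword2And3.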
15637 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15638 SDValue Ptr, uint32_t RsrcDword1,
15639 uint64_t RsrcDword2And3) const {
15640 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15641 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15642 if (RsrcDword1) {
15643 PtrHi =
15644 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15645 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15646 0);
15647 }
15648
15649 SDValue DataLo =
15650 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15651 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15652
15653 const SDValue Ops[] = {
15654 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15655 PtrLo,
15656 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15657 PtrHi,
15658 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15659 DataLo,
15660 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15661 DataHi,
15662 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15663
15664 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15665}
15666
15667//===----------------------------------------------------------------------===//
15668// SI Inline Assembly Support
15669//===----------------------------------------------------------------------===//
15670
15671std::pair<unsigned, const TargetRegisterClass *>
15673 StringRef Constraint,
15674 MVT VT) const {
15675 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15676
15677 const TargetRegisterClass *RC = nullptr;
15678 if (Constraint.size() == 1) {
15679 const unsigned BitWidth = VT.getSizeInBits();
15680 switch (Constraint[0]) {
15681 default:
15682 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15683 case 's':
15684 case 'r':
15685 switch (BitWidth) {
15686 case 16:
15687 RC = &AMDGPU::SReg_32RegClass;
15688 break;
15689 case 64:
15690 RC = &AMDGPU::SGPR_64RegClass;
15691 break;
15692 default:
15693 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15694 if (!RC)
15695 return std::pair(0U, nullptr);
15696 break;
15697 }
15698 break;
15699 case 'v':
15700 switch (BitWidth) {
15701 case 16:
15702 RC = &AMDGPU::VGPR_32RegClass;
15703 break;
15704 default:
15705 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15706 if (!RC)
15707 return std::pair(0U, nullptr);
15708 break;
15709 }
15710 break;
15711 case 'a':
15712 if (!Subtarget->hasMAIInsts())
15713 break;
15714 switch (BitWidth) {
15715 case 16:
15716 RC = &AMDGPU::AGPR_32RegClass;
15717 break;
15718 default:
15719 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15720 if (!RC)
15721 return std::pair(0U, nullptr);
15722 break;
15723 }
15724 break;
15725 }
15726 // We actually support i128, i16 and f16 as inline parameters
15727 // even if they are not reported as legal
15728 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15729 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15730 return std::pair(0U, RC);
15731 }
15732
15733 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15734 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15735 if (RegName.consume_front("v")) {
15736 RC = &AMDGPU::VGPR_32RegClass;
15737 } else if (RegName.consume_front("s")) {
15738 RC = &AMDGPU::SGPR_32RegClass;
15739 } else if (RegName.consume_front("a")) {
15740 RC = &AMDGPU::AGPR_32RegClass;
15741 }
15742
15743 if (RC) {
15744 uint32_t Idx;
15745 if (RegName.consume_front("[")) {
15746 uint32_t End;
15747 bool Failed = RegName.consumeInteger(10, Idx);
15748 Failed |= !RegName.consume_front(":");
15749 Failed |= RegName.consumeInteger(10, End);
15750 Failed |= !RegName.consume_back("]");
15751 if (!Failed) {
15752 uint32_t Width = (End - Idx + 1) * 32;
15753 // Prohibit constraints for register ranges with a width that does not
15754 // match the required type.
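// For example, the constraint "{v[8:9]}" parses to Idx = 8 and End = 9, so
// Width = 64 and it is only accepted for 64-bit value types.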
15755 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15756 return std::pair(0U, nullptr);
15757 MCRegister Reg = RC->getRegister(Idx);
15758 if (SIRegisterInfo::isVGPRClass(RC))
15759 RC = TRI->getVGPRClassForBitWidth(Width);
15760 else if (SIRegisterInfo::isSGPRClass(RC))
15761 RC = TRI->getSGPRClassForBitWidth(Width);
15762 else if (SIRegisterInfo::isAGPRClass(RC))
15763 RC = TRI->getAGPRClassForBitWidth(Width);
15764 if (RC) {
15765 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15766 return std::pair(Reg, RC);
15767 }
15768 }
15769 } else {
15770 // Check for lossy scalar/vector conversions.
15771 if (VT.isVector() && VT.getSizeInBits() != 32)
15772 return std::pair(0U, nullptr);
15773 bool Failed = RegName.getAsInteger(10, Idx);
15774 if (!Failed && Idx < RC->getNumRegs())
15775 return std::pair(RC->getRegister(Idx), RC);
15776 }
15777 }
15778 }
15779
15780 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15781 if (Ret.first)
15782 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15783
15784 return Ret;
15785}
15786
15787static bool isImmConstraint(StringRef Constraint) {
15788 if (Constraint.size() == 1) {
15789 switch (Constraint[0]) {
15790 default:
15791 break;
15792 case 'I':
15793 case 'J':
15794 case 'A':
15795 case 'B':
15796 case 'C':
15797 return true;
15798 }
15799 } else if (Constraint == "DA" || Constraint == "DB") {
15800 return true;
15801 }
15802 return false;
15803}
15804
15805 SITargetLowering::ConstraintType
15806 SITargetLowering::getConstraintType(StringRef Constraint) const {
15807 if (Constraint.size() == 1) {
15808 switch (Constraint[0]) {
15809 default:
15810 break;
15811 case 's':
15812 case 'v':
15813 case 'a':
15814 return C_RegisterClass;
15815 }
15816 }
15817 if (isImmConstraint(Constraint)) {
15818 return C_Other;
15819 }
15820 return TargetLowering::getConstraintType(Constraint);
15821}
15822
15823static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15824 if (Size < 64) {
15825 Val = Val & maskTrailingOnes<uint64_t>(Size);
15826 }
15827 return Val;
15828}
15829
15830 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15831 StringRef Constraint,
15832 std::vector<SDValue> &Ops,
15833 SelectionDAG &DAG) const {
15834 if (isImmConstraint(Constraint)) {
15835 uint64_t Val;
15836 if (getAsmOperandConstVal(Op, Val) &&
15837 checkAsmConstraintVal(Op, Constraint, Val)) {
15838 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15839 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15840 }
15841 } else {
15842 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15843 }
15844}
15845
15846 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15847 unsigned Size = Op.getScalarValueSizeInBits();
15848 if (Size > 64)
15849 return false;
15850
15851 if (Size == 16 && !Subtarget->has16BitInsts())
15852 return false;
15853
15854 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15855 Val = C->getSExtValue();
15856 return true;
15857 }
15858 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15859 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15860 return true;
15861 }
15862 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15863 if (Size != 16 || Op.getNumOperands() != 2)
15864 return false;
15865 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15866 return false;
15867 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15868 Val = C->getSExtValue();
15869 return true;
15870 }
15871 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15872 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15873 return true;
15874 }
15875 }
15876
15877 return false;
15878}
15879
15880 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15881 uint64_t Val) const {
15882 if (Constraint.size() == 1) {
15883 switch (Constraint[0]) {
15884 case 'I':
15886 case 'J':
15887 return isInt<16>(Val);
15888 case 'A':
15889 return checkAsmConstraintValA(Op, Val);
15890 case 'B':
15891 return isInt<32>(Val);
15892 case 'C':
15893 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15895 default:
15896 break;
15897 }
15898 } else if (Constraint.size() == 2) {
15899 if (Constraint == "DA") {
15900 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15901 int64_t LoBits = static_cast<int32_t>(Val);
15902 return checkAsmConstraintValA(Op, HiBits, 32) &&
15903 checkAsmConstraintValA(Op, LoBits, 32);
15904 }
15905 if (Constraint == "DB") {
15906 return true;
15907 }
15908 }
15909 llvm_unreachable("Invalid asm constraint");
15910}
15911
15912 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15913 unsigned MaxSize) const {
15914 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15915 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15916 if (Size == 16) {
15917 MVT VT = Op.getSimpleValueType();
15918 switch (VT.SimpleTy) {
15919 default:
15920 return false;
15921 case MVT::i16:
15922 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15923 case MVT::f16:
15924 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15925 case MVT::bf16:
15926 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15927 case MVT::v2i16:
15928 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15929 case MVT::v2f16:
15930 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15931 case MVT::v2bf16:
15932 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15933 }
15934 }
15935 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15936 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15937 return true;
15938 return false;
15939}
15940
15941static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15942 switch (UnalignedClassID) {
15943 case AMDGPU::VReg_64RegClassID:
15944 return AMDGPU::VReg_64_Align2RegClassID;
15945 case AMDGPU::VReg_96RegClassID:
15946 return AMDGPU::VReg_96_Align2RegClassID;
15947 case AMDGPU::VReg_128RegClassID:
15948 return AMDGPU::VReg_128_Align2RegClassID;
15949 case AMDGPU::VReg_160RegClassID:
15950 return AMDGPU::VReg_160_Align2RegClassID;
15951 case AMDGPU::VReg_192RegClassID:
15952 return AMDGPU::VReg_192_Align2RegClassID;
15953 case AMDGPU::VReg_224RegClassID:
15954 return AMDGPU::VReg_224_Align2RegClassID;
15955 case AMDGPU::VReg_256RegClassID:
15956 return AMDGPU::VReg_256_Align2RegClassID;
15957 case AMDGPU::VReg_288RegClassID:
15958 return AMDGPU::VReg_288_Align2RegClassID;
15959 case AMDGPU::VReg_320RegClassID:
15960 return AMDGPU::VReg_320_Align2RegClassID;
15961 case AMDGPU::VReg_352RegClassID:
15962 return AMDGPU::VReg_352_Align2RegClassID;
15963 case AMDGPU::VReg_384RegClassID:
15964 return AMDGPU::VReg_384_Align2RegClassID;
15965 case AMDGPU::VReg_512RegClassID:
15966 return AMDGPU::VReg_512_Align2RegClassID;
15967 case AMDGPU::VReg_1024RegClassID:
15968 return AMDGPU::VReg_1024_Align2RegClassID;
15969 case AMDGPU::AReg_64RegClassID:
15970 return AMDGPU::AReg_64_Align2RegClassID;
15971 case AMDGPU::AReg_96RegClassID:
15972 return AMDGPU::AReg_96_Align2RegClassID;
15973 case AMDGPU::AReg_128RegClassID:
15974 return AMDGPU::AReg_128_Align2RegClassID;
15975 case AMDGPU::AReg_160RegClassID:
15976 return AMDGPU::AReg_160_Align2RegClassID;
15977 case AMDGPU::AReg_192RegClassID:
15978 return AMDGPU::AReg_192_Align2RegClassID;
15979 case AMDGPU::AReg_256RegClassID:
15980 return AMDGPU::AReg_256_Align2RegClassID;
15981 case AMDGPU::AReg_512RegClassID:
15982 return AMDGPU::AReg_512_Align2RegClassID;
15983 case AMDGPU::AReg_1024RegClassID:
15984 return AMDGPU::AReg_1024_Align2RegClassID;
15985 default:
15986 return -1;
15987 }
15988}
15989
15990// Figure out which registers should be reserved for stack access. Only after
15991// the function is legalized do we know all of the non-spill stack objects or if
15992// calls are present.
15993 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15994 MachineRegisterInfo &MRI = MF.getRegInfo();
15995 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15996 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15997 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15998 const SIInstrInfo *TII = ST.getInstrInfo();
15999
16000 if (Info->isEntryFunction()) {
16001 // Callable functions have fixed registers used for stack access.
16003 }
16004
16005 // TODO: Move this logic to getReservedRegs()
16006 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16007 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16008 Register SReg = ST.isWave32()
16009 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16010 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16011 &AMDGPU::SGPR_64RegClass);
16012 Info->setSGPRForEXECCopy(SReg);
16013
16014 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16015 Info->getStackPtrOffsetReg()));
16016 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16017 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16018
16019 // We need to worry about replacing the default register with itself in case
16020 // of MIR testcases missing the MFI.
16021 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16022 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16023
16024 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16025 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16026
16027 Info->limitOccupancy(MF);
16028
16029 if (ST.isWave32() && !MF.empty()) {
16030 for (auto &MBB : MF) {
16031 for (auto &MI : MBB) {
16032 TII->fixImplicitOperands(MI);
16033 }
16034 }
16035 }
16036
16037 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16038 // classes if required. Ideally the register class constraints would differ
16039 // per-subtarget, but there's no easy way to achieve that right now. This is
16040 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16041 // from using them as the register class for legal types.
16042 if (ST.needsAlignedVGPRs()) {
16043 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16044 const Register Reg = Register::index2VirtReg(I);
16045 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16046 if (!RC)
16047 continue;
16048 int NewClassID = getAlignedAGPRClassID(RC->getID());
16049 if (NewClassID != -1)
16050 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16051 }
16052 }
16053
16054 TargetLoweringBase::finalizeLowering(MF);
16055}
16056
16057 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16058 KnownBits &Known,
16059 const APInt &DemandedElts,
16060 const SelectionDAG &DAG,
16061 unsigned Depth) const {
16062 Known.resetAll();
16063 unsigned Opc = Op.getOpcode();
16064 switch (Opc) {
16065 case ISD::INTRINSIC_WO_CHAIN: {
16066 unsigned IID = Op.getConstantOperandVal(0);
16067 switch (IID) {
16068 case Intrinsic::amdgcn_mbcnt_lo:
16069 case Intrinsic::amdgcn_mbcnt_hi: {
16070 const GCNSubtarget &ST =
16071 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16072 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16073 // most 31 + src1.
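// For example, in wave32 both intrinsics return at most 31 + src1, so bits 5
// and above start out known zero before the known bits of src1 are added in.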
16074 Known.Zero.setBitsFrom(
16075 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16076 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16077 Known = KnownBits::add(Known, Known2);
16078 return;
16079 }
16080 }
16081 break;
16082 }
16083 }
16084 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16085 Op, Known, DemandedElts, DAG, Depth);
16086}
16087
16088 void SITargetLowering::computeKnownBitsForFrameIndex(
16089 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16090 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16091
16092 // Set the high bits to zero based on the maximum allowed scratch size per
16093 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16094 // calculation won't overflow, so assume the sign bit is never set.
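// In other words, a frame index is treated as an unsigned value bounded by
// the per-wave scratch limit the subtarget reports, so its upper bits are
// known zero.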
16095 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16096}
16097
16098 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16099 KnownBits &Known, unsigned Dim) {
16100 unsigned MaxValue =
16101 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16102 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16103}
16104
16105 void SITargetLowering::computeKnownBitsForTargetInstr(
16106 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16107 const MachineRegisterInfo &MRI, unsigned Depth) const {
16108 const MachineInstr *MI = MRI.getVRegDef(R);
16109 switch (MI->getOpcode()) {
16110 case AMDGPU::G_INTRINSIC:
16111 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16112 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16113 switch (IID) {
16114 case Intrinsic::amdgcn_workitem_id_x:
16115 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16116 break;
16117 case Intrinsic::amdgcn_workitem_id_y:
16118 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16119 break;
16120 case Intrinsic::amdgcn_workitem_id_z:
16121 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16122 break;
16123 case Intrinsic::amdgcn_mbcnt_lo:
16124 case Intrinsic::amdgcn_mbcnt_hi: {
16125 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16126 // most 31 + src1.
16127 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16128 ? getSubtarget()->getWavefrontSizeLog2()
16129 : 5);
16130 KnownBits Known2;
16131 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16132 Depth + 1);
16133 Known = KnownBits::add(Known, Known2);
16134 break;
16135 }
16136 case Intrinsic::amdgcn_groupstaticsize: {
16137 // We can report everything over the maximum size as 0. We can't report
16138 // based on the actual size because we don't know if it's accurate or not
16139 // at any given point.
16140 Known.Zero.setHighBits(
16141 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16142 break;
16143 }
16144 }
16145 break;
16146 }
16147 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16148 Known.Zero.setHighBits(24);
16149 break;
16150 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16151 Known.Zero.setHighBits(16);
16152 break;
16153 case AMDGPU::G_AMDGPU_SMED3:
16154 case AMDGPU::G_AMDGPU_UMED3: {
16155 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16156
16157 KnownBits Known2;
16158 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16159 if (Known2.isUnknown())
16160 break;
16161
16162 KnownBits Known1;
16163 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16164 if (Known1.isUnknown())
16165 break;
16166
16167 KnownBits Known0;
16168 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16169 if (Known0.isUnknown())
16170 break;
16171
16172 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16173 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16174 Known.One = Known0.One & Known1.One & Known2.One;
16175 break;
16176 }
16177 }
16178}
16179
16180 Align SITargetLowering::computeKnownAlignForTargetInstr(
16181 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16182 unsigned Depth) const {
16183 const MachineInstr *MI = MRI.getVRegDef(R);
16184 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16185 // FIXME: Can this move to generic code? What about the case where the call
16186 // site specifies a lower alignment?
16187 Intrinsic::ID IID = GI->getIntrinsicID();
16189 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16190 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16191 return *RetAlign;
16192 }
16193 return Align(1);
16194}
16195
16196 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16197 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16198 const Align CacheLineAlign = Align(64);
16199
16200 // Pre-GFX10 targets did not benefit from loop alignment.
16201 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16202 getSubtarget()->hasInstFwdPrefetchBug())
16203 return PrefAlign;
16204
16205 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16206 // By default the prefetcher keeps one cache line behind and reads two ahead.
16207 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16208 // behind and one ahead.
16209 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
16210 // If the loop fits in 64 bytes it always spans no more than two cache lines and
16211 // does not need an alignment.
16212 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
16213 // if it is at most 192 bytes we need two lines behind.
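// In terms of the code below: LoopSize <= 64 keeps the default alignment,
// LoopSize <= 128 aligns to the cache line, LoopSize <= 192 additionally
// switches S_INST_PREFETCH to two lines behind, and larger loops keep the
// default alignment.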
16214
16215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16216 const MachineBasicBlock *Header = ML->getHeader();
16217 if (Header->getAlignment() != PrefAlign)
16218 return Header->getAlignment(); // Already processed.
16219
16220 unsigned LoopSize = 0;
16221 for (const MachineBasicBlock *MBB : ML->blocks()) {
16222 // If an inner loop block is aligned, assume that on average half of the
16223 // alignment size is added as nops.
16224 if (MBB != Header)
16225 LoopSize += MBB->getAlignment().value() / 2;
16226
16227 for (const MachineInstr &MI : *MBB) {
16228 LoopSize += TII->getInstSizeInBytes(MI);
16229 if (LoopSize > 192)
16230 return PrefAlign;
16231 }
16232 }
16233
16234 if (LoopSize <= 64)
16235 return PrefAlign;
16236
16237 if (LoopSize <= 128)
16238 return CacheLineAlign;
16239
16240 // If any of parent loops is surrounded by prefetch instructions do not
16241 // insert new for inner loop, which would reset parent's settings.
16242 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16243 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16244 auto I = Exit->getFirstNonDebugInstr();
16245 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16246 return CacheLineAlign;
16247 }
16248 }
16249
16250 MachineBasicBlock *Pre = ML->getLoopPreheader();
16251 MachineBasicBlock *Exit = ML->getExitBlock();
16252
16253 if (Pre && Exit) {
16254 auto PreTerm = Pre->getFirstTerminator();
16255 if (PreTerm == Pre->begin() ||
16256 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16257 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16258 .addImm(1); // prefetch 2 lines behind PC
16259
16260 auto ExitHead = Exit->getFirstNonDebugInstr();
16261 if (ExitHead == Exit->end() ||
16262 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16263 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16264 .addImm(2); // prefetch 1 line behind PC
16265 }
16266
16267 return CacheLineAlign;
16268}
16269
16271static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16272 assert(N->getOpcode() == ISD::CopyFromReg);
16273 do {
16274 // Follow the chain until we find an INLINEASM node.
16275 N = N->getOperand(0).getNode();
16276 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16277 return true;
16278 } while (N->getOpcode() == ISD::CopyFromReg);
16279 return false;
16280}
16281
16284 UniformityInfo *UA) const {
16285 switch (N->getOpcode()) {
16286 case ISD::CopyFromReg: {
16287 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16288 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16289 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16290 Register Reg = R->getReg();
16291
16292 // FIXME: Why does this need to consider isLiveIn?
16293 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16294 return !TRI->isSGPRReg(MRI, Reg);
16295
16296 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16297 return UA->isDivergent(V);
16298
16299 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16300 return !TRI->isSGPRReg(MRI, Reg);
16301 }
16302 case ISD::LOAD: {
16303 const LoadSDNode *L = cast<LoadSDNode>(N);
16304 unsigned AS = L->getAddressSpace();
16305 // A flat load may access private memory.
16307 }
16308 case ISD::CALLSEQ_END:
16309 return true;
16311 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16313 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16332 // Target-specific read-modify-write atomics are sources of divergence.
16333 return true;
16334 default:
16335 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16336 // Generic read-modify-write atomics are sources of divergence.
16337 return A->readMem() && A->writeMem();
16338 }
16339 return false;
16340 }
16341}
16342
16344 EVT VT) const {
16345 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16346 case MVT::f32:
16348 case MVT::f64:
16349 case MVT::f16:
16351 default:
16352 return false;
16353 }
16354}
16355
16357 LLT Ty, const MachineFunction &MF) const {
16358 switch (Ty.getScalarSizeInBits()) {
16359 case 32:
16360 return !denormalModeIsFlushAllF32(MF);
16361 case 64:
16362 case 16:
16363 return !denormalModeIsFlushAllF64F16(MF);
16364 default:
16365 return false;
16366 }
16367}
16368
16370 const SelectionDAG &DAG,
16371 bool SNaN,
16372 unsigned Depth) const {
16373 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16374 const MachineFunction &MF = DAG.getMachineFunction();
16376
16377 if (Info->getMode().DX10Clamp)
16378 return true; // Clamped to 0.
16379 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16380 }
16381
16383 Depth);
16384}
16385
16386 // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
16387 // they do not support FP32 denormals and only support v2f16/f64 denormals.
16389 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16390 return true;
16391
16393 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16394 if (DenormMode == DenormalMode::getPreserveSign())
16395 return true;
16396
16397 // TODO: Remove this.
16398 return RMW->getFunction()
16399 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16400 .getValueAsBool();
16401}
16402
16404 LLVMContext &Ctx = RMW->getContext();
16405 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16406 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16407
16408 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16409 << "Hardware instruction generated for atomic "
16410 << RMW->getOperationName(RMW->getOperation())
16411 << " operation at memory scope " << MemScope;
16412}
16413
16414static bool isV2F16OrV2BF16(Type *Ty) {
16415 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16416 Type *EltTy = VT->getElementType();
16417 return VT->getNumElements() == 2 &&
16418 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16419 }
16420
16421 return false;
16422}
16423
16424static bool isV2F16(Type *Ty) {
16425 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16426 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16427}
16428
16429static bool isV2BF16(Type *Ty) {
16430 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16431 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16432}
16433
16434/// \return true if atomicrmw integer ops work for the type.
16435static bool isAtomicRMWLegalIntTy(Type *Ty) {
16436 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16437 unsigned BW = IT->getBitWidth();
16438 return BW == 32 || BW == 64;
16439 }
16440
16441 return false;
16442}
16443
16444/// \return true if this atomicrmw xchg type can be selected.
16445static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16446 Type *Ty = RMW->getType();
16447 if (isAtomicRMWLegalIntTy(Ty))
16448 return true;
16449
16450 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16451 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16452 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16453 return BW == 32 || BW == 64;
16454 }
16455
16456 if (Ty->isFloatTy() || Ty->isDoubleTy())
16457 return true;
16458
16459 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16460 return VT->getNumElements() == 2 &&
16461 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16462 }
16463
16464 return false;
16465}
16466
16467/// \returns true if it's valid to emit a native instruction for \p RMW, based
16468/// on the properties of the target memory.
16469static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16470 const AtomicRMWInst *RMW,
16471 bool HasSystemScope) {
16472 // The remote/fine-grained access logic is different from the integer
16473 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16474 // fine-grained access does not work, even for a device local allocation.
16475 //
16476 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16477 // allocations work.
16478 if (HasSystemScope) {
16480 RMW->hasMetadata("amdgpu.no.remote.memory"))
16481 return true;
16483 return true;
16484
16485 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16486}
16487
16488/// \return Action to perform on AtomicRMWInsts for integer operations.
16491 return isAtomicRMWLegalIntTy(RMW->getType())
16494}
16495
16496/// Return if a flat address space atomicrmw can access private memory.
16498 const MDNode *NoaliasAddrSpaceMD =
16499 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16500 if (!NoaliasAddrSpaceMD)
16501 return true;
16502
16503 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16504 ++I) {
16505 auto *Low = mdconst::extract<ConstantInt>(
16506 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16507 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16508 auto *High = mdconst::extract<ConstantInt>(
16509 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16510 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16511 }
16512 }
16513
16514 return true;
16515}
16516
16519 unsigned AS = RMW->getPointerAddressSpace();
16520 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16522
16523 // 64-bit flat atomics that dynamically reside in private memory will silently
16524 // be dropped.
16525 //
16526 // Note that we will emit a new copy of the original atomic in the expansion,
16527 // which will be incrementally relegalized.
16528 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16529 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16530 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16533
16534 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16536 ORE.emit([=]() {
16537 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16538 });
16539 return Kind;
16540 };
16541
16542 auto SSID = RMW->getSyncScopeID();
16543 bool HasSystemScope =
16544 SSID == SyncScope::System ||
16545 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16546
16547 auto Op = RMW->getOperation();
16548 switch (Op) {
16549 case AtomicRMWInst::Xchg: {
16550 // PCIe supports add and xchg for system atomics.
16551 return isAtomicRMWLegalXChgTy(RMW)
16554 }
16555 case AtomicRMWInst::Add:
16556 case AtomicRMWInst::And:
16560 case AtomicRMWInst::Sub:
16561 case AtomicRMWInst::Or:
16562 case AtomicRMWInst::Xor: {
16563 // Atomic sub/or/xor do not work over PCI express, but atomic add
16564 // does. InstCombine transforms these with 0 to or, so undo that.
16565 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16566 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16567 ConstVal && ConstVal->isNullValue())
16569 }
16570
16572 }
16573 case AtomicRMWInst::FAdd: {
16574 Type *Ty = RMW->getType();
16575
16576 // TODO: Handle REGION_ADDRESS
16577 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16578 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16579 // is fixed to round-to-nearest-even.
16580 //
16581 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16582 // round-to-nearest-even.
16583 //
16584 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16585 // suggests it is OK if the floating-point mode may not match the calling
16586 // thread.
16587 if (Ty->isFloatTy()) {
16590 }
16591
16592 if (Ty->isDoubleTy()) {
16593 // Ignores denormal mode, but we don't consider flushing mandatory.
16596 }
16597
16598 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16600
16602 }
16603
16604 // LDS atomics respect the denormal mode from the mode register.
16605 //
16606 // Traditionally f32 global/buffer memory atomics would unconditionally
16607 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16608 // flush.
16609 //
16610 // On targets with flat atomic fadd, denormals would flush depending on
16611 // whether the target address resides in LDS or global memory. We consider
16612 // this flat-maybe-flush as will-flush.
16613 if (Ty->isFloatTy() &&
16617
16618 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16619 // safe. The message phrasing also should be better.
16620 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16621 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16622 // gfx940, gfx12
16623 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16624 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16625 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16626 // gfx90a, gfx940, gfx12
16627 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16628 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16629
16630 // gfx940, gfx12
16631 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16632 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16633 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16634 // gfx90a, gfx940, gfx12
16635 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16636 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16637
16638 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16639 // buffer. gfx12 does have the buffer version.
16640 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16641 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16642 }
16643
16644 // global and flat atomic fadd f64: gfx90a, gfx940.
16645 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16646 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16647
16648 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16649 if (Ty->isFloatTy()) {
16650 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16651 // gfx11+.
16652 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16653 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16654 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16655 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16656 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16657 } else {
16658 // gfx908
16659 if (RMW->use_empty() &&
16661 isV2F16(Ty))
16662 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16663 }
16664 }
16665
16666 // flat atomic fadd f32: gfx940, gfx11+.
16667 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16668 if (Subtarget->hasFlatAtomicFaddF32Inst())
16669 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16670
16671 // If the address is in the flat address space and the type is float, we
16672 // try to expand it when the target supports both global and LDS atomic
16673 // fadd. This is needed because the expansion emits an address-space
16674 // check: if the address is in the global address space we emit the
16675 // global atomic fadd, and if it is in the shared address space we emit
16676 // the LDS atomic fadd.
16677 if (Subtarget->hasLDSFPAtomicAddF32()) {
16678 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16680 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16682 }
16683 }
16684 }
16685
16687 }
16689 case AtomicRMWInst::FMax: {
16690 Type *Ty = RMW->getType();
16691
16692 // LDS float and double fmin/fmax were always supported.
16693 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16694 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16696 }
16697
16698 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16699 // For flat and global cases:
16700 // float, double in gfx7. Manual claims denormal support.
16701 // Removed in gfx8.
16702 // float, double restored in gfx10.
16703 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16704 //
16705 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16706 // no f32.
16707 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16708 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16709 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16710 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16711 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16712 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16714 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16715 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16716 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16717 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16718 }
16719 }
16720
16722 }
16723 case AtomicRMWInst::Min:
16724 case AtomicRMWInst::Max:
16726 case AtomicRMWInst::UMax: {
16729 // Always expand system scope min/max atomics.
16730 if (HasSystemScope)
16732 }
16733
16735 }
16738 default:
16740 }
16741
16742 llvm_unreachable("covered atomicrmw op switch");
16743}
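To make the dispatch above concrete, here is a minimal sketch (not code from this file; the module setup and function name are assumptions) that builds the kind of global-memory atomicrmw fadd the hook classifies. On subtargets reporting hasAtomicFaddRtnInsts()/hasAtomicFaddNoRtnInsts() it may be kept as a hardware atomic; otherwise it is expanded, for example into a compare-and-swap loop.

// Sketch only, assuming an existing llvm::Module; not part of SIISelLowering.cpp.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static void buildGlobalAtomicFAdd(Module &M) {
  LLVMContext &Ctx = M.getContext();
  IRBuilder<> B(Ctx);
  // addrspace(1) is the AMDGPU global address space.
  PointerType *GlobalPtrTy = PointerType::get(Ctx, /*AddressSpace=*/1);
  FunctionType *FTy =
      FunctionType::get(B.getVoidTy(), {GlobalPtrTy, B.getFloatTy()}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "fadd_rmw", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);
  // atomicrmw fadd ptr addrspace(1) %p, float %v monotonic
  B.CreateAtomicRMW(AtomicRMWInst::FAdd, F->getArg(0), F->getArg(1),
                    MaybeAlign(4), AtomicOrdering::Monotonic);
  B.CreateRetVoid();
}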
16744
16750}
16751
16754 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16757}
16758
16761 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16762 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16764
16765 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16767
16768 const DataLayout &DL = CmpX->getDataLayout();
16769
16770 Type *ValTy = CmpX->getNewValOperand()->getType();
16771
16772 // If a 64-bit flat atomic may alias private, we need to avoid using the
16773 // atomic in the private case.
16774 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16776}
16777
16778const TargetRegisterClass *
16779SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16781 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16782 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16783 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16784 : &AMDGPU::SReg_32RegClass;
16785 if (!TRI->isSGPRClass(RC) && !isDivergent)
16786 return TRI->getEquivalentSGPRClass(RC);
16787 if (TRI->isSGPRClass(RC) && isDivergent)
16788 return TRI->getEquivalentVGPRClass(RC);
16789
16790 return RC;
16791}
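As an illustration of the mapping above, a small sketch (not part of this file; how the SITargetLowering reference is obtained is left out, and the headers are those this file already includes): the divergence flag selects between the scalar and vector register files for the same value type.

// Sketch only; `TLI` is assumed to be an already-constructed SITargetLowering.
static void queryRegClasses(const SITargetLowering &TLI) {
  // A uniform i32 stays in the scalar file; a divergent one moves to VGPRs.
  const TargetRegisterClass *UniformRC =
      TLI.getRegClassFor(MVT::i32, /*isDivergent=*/false); // typically SReg_32
  const TargetRegisterClass *DivergentRC =
      TLI.getRegClassFor(MVT::i32, /*isDivergent=*/true);  // typically VGPR_32
  // A uniform i1 maps to the wave-mask class: SReg_64 on wave64, SReg_32 on wave32.
  const TargetRegisterClass *WaveMaskRC =
      TLI.getRegClassFor(MVT::i1, /*isDivergent=*/false);
  (void)UniformRC; (void)DivergentRC; (void)WaveMaskRC;
}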
16792
16793// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16794// uniform values (as produced by the mask results of control flow intrinsics)
16795// used outside of divergent blocks. The phi users need to also be treated as
16796// always uniform.
16797//
16798// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16799static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16800 unsigned WaveSize) {
16801 // FIXME: We assume we never cast the mask results of a control flow
16802 // intrinsic.
16803 // Early exit if the type won't be consistent as a compile time hack.
16804 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16805 if (!IT || IT->getBitWidth() != WaveSize)
16806 return false;
16807
16808 if (!isa<Instruction>(V))
16809 return false;
16810 if (!Visited.insert(V).second)
16811 return false;
16812 bool Result = false;
16813 for (const auto *U : V->users()) {
16814 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16815 if (V == U->getOperand(1)) {
16816 switch (Intrinsic->getIntrinsicID()) {
16817 default:
16818 Result = false;
16819 break;
16820 case Intrinsic::amdgcn_if_break:
16821 case Intrinsic::amdgcn_if:
16822 case Intrinsic::amdgcn_else:
16823 Result = true;
16824 break;
16825 }
16826 }
16827 if (V == U->getOperand(0)) {
16828 switch (Intrinsic->getIntrinsicID()) {
16829 default:
16830 Result = false;
16831 break;
16832 case Intrinsic::amdgcn_end_cf:
16833 case Intrinsic::amdgcn_loop:
16834 Result = true;
16835 break;
16836 }
16837 }
16838 } else {
16839 Result = hasCFUser(U, Visited, WaveSize);
16840 }
16841 if (Result)
16842 break;
16843 }
16844 return Result;
16845}
16846
16848 const Value *V) const {
16849 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16850 if (CI->isInlineAsm()) {
16851 // FIXME: This cannot give a correct answer. This should only trigger in
16852 // the case where inline asm returns mixed SGPR and VGPR results, used
16853 // outside the defining block. We don't have a specific result to
16854 // consider, so this assumes if any value is SGPR, the overall register
16855 // also needs to be SGPR.
16856 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16858 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16859 for (auto &TC : TargetConstraints) {
16860 if (TC.Type == InlineAsm::isOutput) {
16862 const TargetRegisterClass *RC =
16863 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16864 TC.ConstraintVT)
16865 .second;
16866 if (RC && SIRI->isSGPRClass(RC))
16867 return true;
16868 }
16869 }
16870 }
16871 }
16873 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16874}
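For context, a sketch under stated assumptions (not code from this file): an inline asm output that uses the AMDGPU SGPR constraint "s" is the kind of case the constraint walk above catches, forcing the result into a uniform register.

// Sketch only: builds a call whose "=s" output constraint requests an SGPR.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"

using namespace llvm;

static Value *readConstantIntoSGPR(IRBuilder<> &B) {
  FunctionType *FTy = FunctionType::get(B.getInt32Ty(), /*isVarArg=*/false);
  InlineAsm *IA = InlineAsm::get(FTy, "s_mov_b32 $0, 42", "=s",
                                 /*hasSideEffects=*/false);
  return B.CreateCall(IA, {}, "sgpr.val");
}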
16875
16877 for (SDUse &Use : N->uses()) {
16878 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16879 if (getBasePtrIndex(M) == Use.getOperandNo())
16880 return true;
16881 }
16882 }
16883 return false;
16884}
16885
16887 SDValue N1) const {
16888 if (!N0.hasOneUse())
16889 return false;
16890 // Preserve the opportunity to keep N0 uniform.
16891 if (N0->isDivergent() || !N1->isDivergent())
16892 return true;
16893 // Check if we have a good chance to form the memory access pattern with the
16894 // base and offset
16895 return (DAG.isBaseWithConstantOffset(N0) &&
16897}
16898
16900 Register N0, Register N1) const {
16901 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16902}
16903
16906 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16908 if (I.getMetadata("amdgpu.noclobber"))
16909 Flags |= MONoClobber;
16910 if (I.getMetadata("amdgpu.last.use"))
16911 Flags |= MOLastUse;
16912 return Flags;
16913}
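For reference, a sketch with assumed setup (not part of this file) of how the metadata consumed above ends up on a load; AMDGPUAnnotateUniformValues normally attaches it, here it is done by hand.

// Sketch only: tags a load so getTargetMMOFlags() adds MONoClobber / MOLastUse.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static LoadInst *tagLoad(IRBuilder<> &B, Value *GlobalPtr, Type *Ty) {
  LoadInst *LI = B.CreateLoad(Ty, GlobalPtr, "val");
  LLVMContext &Ctx = B.getContext();
  // Only the presence of the metadata kind matters; the node can be empty.
  LI->setMetadata("amdgpu.noclobber", MDNode::get(Ctx, {}));
  LI->setMetadata("amdgpu.last.use", MDNode::get(Ctx, {}));
  return LI;
}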
16914
16916 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16917 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16918 if (User->getOpcode() != ISD::CopyToReg)
16919 return false;
16920 if (!Def->isMachineOpcode())
16921 return false;
16922 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16923 if (!MDef)
16924 return false;
16925
16926 unsigned ResNo = User->getOperand(Op).getResNo();
16927 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16928 return false;
16929 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16930 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16931 PhysReg = AMDGPU::SCC;
16932 const TargetRegisterClass *RC =
16933 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16934 Cost = RC->getCopyCost();
16935 return true;
16936 }
16937 return false;
16938}
16939
16941 Instruction *AI) const {
16942 // Given: atomicrmw fadd ptr %addr, float %val ordering
16943 //
16944 // With this expansion we produce the following code:
16945 // [...]
16946 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16947 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16948 //
16949 // atomicrmw.shared:
16950 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16951 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16952 // float %val ordering
16953 // br label %atomicrmw.phi
16954 //
16955 // atomicrmw.check.private:
16956 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16957 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16958 //
16959 // atomicrmw.private:
16960 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16961 // %loaded.private = load float, ptr addrspace(5) %cast.private
16962 // %val.new = fadd float %loaded.private, %val
16963 // store float %val.new, ptr addrspace(5) %cast.private
16964 // br label %atomicrmw.phi
16965 //
16966 // atomicrmw.global:
16967 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16968 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16969 // float %val ordering
16970 // br label %atomicrmw.phi
16971 //
16972 // atomicrmw.phi:
16973 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16974 // [ %loaded.private, %atomicrmw.private ],
16975 // [ %loaded.global, %atomicrmw.global ]
16976 // br label %atomicrmw.end
16977 //
16978 // atomicrmw.end:
16979 // [...]
16980 //
16981 //
16982 // For 64-bit atomics which may reside in private memory, we perform a simpler
16983 // version that only inserts the private check, and uses the flat operation.
16984
16985 IRBuilder<> Builder(AI);
16986 LLVMContext &Ctx = Builder.getContext();
16987
16988 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16989 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16991 Value *Addr = AI->getOperand(PtrOpIdx);
16992
16993 /// TODO: Only need to check private, then emit flat-known-not private (no
16994 /// need for shared block, or cast to global).
16995 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16996
16997 Align Alignment;
16998 if (RMW)
16999 Alignment = RMW->getAlign();
17000 else if (CX)
17001 Alignment = CX->getAlign();
17002 else
17003 llvm_unreachable("unhandled atomic operation");
17004
17005 // FullFlatEmulation is true if we need to issue the private, shared, and
17006 // global cases.
17007 //
17008 // If this is false, we are only dealing with the flat-targeting-private case,
17009 // where we only insert a check for private and still use the flat instruction
17010 // for global and shared.
17011
17012 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17013 Subtarget->hasAtomicFaddInsts() &&
17014 RMW->getType()->isFloatTy();
17015
17016 // If the return value isn't used, do not introduce a false use in the phi.
17017 bool ReturnValueIsUsed = !AI->use_empty();
17018
17019 BasicBlock *BB = Builder.GetInsertBlock();
17020 Function *F = BB->getParent();
17021 BasicBlock *ExitBB =
17022 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17023 BasicBlock *SharedBB = nullptr;
17024
17025 BasicBlock *CheckPrivateBB = BB;
17026 if (FullFlatEmulation) {
17027 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17028 CheckPrivateBB =
17029 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17030 }
17031
17032 BasicBlock *PrivateBB =
17033 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17034 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17035 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17036
17037 std::prev(BB->end())->eraseFromParent();
17038 Builder.SetInsertPoint(BB);
17039
17040 Value *LoadedShared = nullptr;
17041 if (FullFlatEmulation) {
17042 CallInst *IsShared = Builder.CreateIntrinsic(
17043 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17044 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17045 Builder.SetInsertPoint(SharedBB);
17046 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17048
17049 Instruction *Clone = AI->clone();
17050 Clone->insertInto(SharedBB, SharedBB->end());
17051 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17052 LoadedShared = Clone;
17053
17054 Builder.CreateBr(PhiBB);
17055 Builder.SetInsertPoint(CheckPrivateBB);
17056 }
17057
17058 CallInst *IsPrivate = Builder.CreateIntrinsic(
17059 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17060 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17061
17062 Builder.SetInsertPoint(PrivateBB);
17063
17064 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17066
17067 Value *LoadedPrivate;
17068 if (RMW) {
17069 LoadedPrivate = Builder.CreateAlignedLoad(
17070 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17071
17072 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17073 LoadedPrivate, RMW->getValOperand());
17074
17075 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17076 } else {
17077 auto [ResultLoad, Equal] =
17078 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17079 CX->getNewValOperand(), CX->getAlign());
17080
17081 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17082 ResultLoad, 0);
17083 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17084 }
17085
17086 Builder.CreateBr(PhiBB);
17087
17088 Builder.SetInsertPoint(GlobalBB);
17089
17090 // Continue using a flat instruction if we only emitted the check for private.
17091 Instruction *LoadedGlobal = AI;
17092 if (FullFlatEmulation) {
17093 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17095 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17096 }
17097
17098 AI->removeFromParent();
17099 AI->insertInto(GlobalBB, GlobalBB->end());
17100
17101 // The new atomicrmw may go through another round of legalization later.
17102 if (!FullFlatEmulation) {
17103 // We already inserted the runtime check, so make sure we do not try to
17104 // re-expand this.
17105 // TODO: Should union with any existing metadata.
17106 MDBuilder MDB(F->getContext());
17107 MDNode *RangeNotPrivate =
17110 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17111 RangeNotPrivate);
17112 }
17113
17114 Builder.CreateBr(PhiBB);
17115
17116 Builder.SetInsertPoint(PhiBB);
17117
17118 if (ReturnValueIsUsed) {
17119 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17120 AI->replaceAllUsesWith(Loaded);
17121 if (FullFlatEmulation)
17122 Loaded->addIncoming(LoadedShared, SharedBB);
17123 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17124 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17125 Loaded->takeName(AI);
17126 }
17127
17128 Builder.CreateBr(ExitBB);
17129}
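The partial (private-check-only) path above relies on range metadata to record that the rewritten flat access can no longer target the private address space. A sketch of that kind of annotation (assumptions: an existing flat atomic instruction, and the AMDGPU address-space and APInt headers this file already includes):

// Sketch only: the style of !noalias.addrspace range the partial path attaches.
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

static void markNotPrivate(Instruction *FlatAtomic) {
  MDBuilder MDB(FlatAtomic->getContext());
  // Excluded address-space range [PRIVATE_ADDRESS, PRIVATE_ADDRESS + 1).
  MDNode *NotPrivate =
      MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                      APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
  FlatAtomic->setMetadata(LLVMContext::MD_noalias_addrspace, NotPrivate);
}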
17130
17133
17136 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17137 ConstVal && ConstVal->isNullValue()) {
17138 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17140
17141 // We may still need the private-alias-flat handling below.
17142
17143 // TODO: Skip this for cases where we cannot access remote memory.
17144 }
17145 }
17146
17147 // The non-flat expansions should only perform the de-canonicalization of
17148 // identity values.
17150 return;
17151
17153}
17154
17157}
17158
17159LoadInst *
17161 IRBuilder<> Builder(AI);
17162 auto Order = AI->getOrdering();
17163
17164 // The optimization removes the store aspect of the atomicrmw, so the cache
17165 // must be flushed if the atomic ordering has release semantics. This does
17166 // not necessarily require a fence; a release fence just happens to perform
17167 // that flush. Avoid replacing an atomicrmw that has release semantics.
17168 if (isReleaseOrStronger(Order))
17169 return nullptr;
17170
17171 LoadInst *LI = Builder.CreateAlignedLoad(
17172 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17173 LI->setAtomic(Order, AI->getSyncScopeID());
17174 LI->copyMetadata(*AI);
17175 LI->takeName(AI);
17176 AI->replaceAllUsesWith(LI);
17177 AI->eraseFromParent();
17178 return LI;
17179}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1117
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5465
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1484
bool isNegative() const
Definition: APFloat.h:1440
APInt bitcastToAPInt() const
Definition: APFloat.h:1346
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1135
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1095
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1076
bool isInfinity() const
Definition: APFloat.h:1437
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1459
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2554
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1830
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1144
LLVMContext & getContext() const
Definition: IRBuilder.h:173
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1138
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1849
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2160
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
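A small sketch of the LLT queries above, assuming the LLVM 20 header layout where LLT lives in llvm/CodeGenTypes/LowLevelType.h; the helper names and the address-space number are illustrative.

#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

// Widen narrow scalars to 32 bits and leave everything else untouched.
static LLT widenScalarTo32(LLT Ty) {
  if (Ty.isScalar() && Ty.getScalarSizeInBits() < 32)
    return LLT::scalar(32);
  return Ty;
}

// A 64-bit pointer in address space 0, just to show LLT::pointer.
static LLT examplePointer() { return LLT::pointer(0, 64); }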
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
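A sketch tying the LLVMContext sync-scope entries to LoadInst::setAtomic: build an aligned load and mark it atomic in a named synchronization scope. The "agent" scope string is only an example of a target-defined scope name.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static LoadInst *emitScopedAtomicLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
                                      Align A) {
  LLVMContext &Ctx = B.getContext();
  // Map a scope name to an ID (creating it on first use).
  SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");
  LoadInst *LI = B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(A), "scoped.load");
  LI->setAtomic(AtomicOrdering::SequentiallyConsistent, SSID);
  return LI;
}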
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
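A minimal sketch of MDBuilder::createRange combined with Instruction::setMetadata: attach a [0, Hi) !range node to a load. The 32-bit width and the bound are arbitrary example values.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

static void annotateRange(LoadInst &LI, unsigned HiExclusive) {
  MDBuilder MDB(LI.getContext());
  // Metadata describing the half-open range [0, HiExclusive).
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, HiExclusive));
  LI.setMetadata(LLVMContext::MD_range, Range);
}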
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
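A sketch of the MVT queries and factory functions above, assuming the LLVM 20 header location llvm/CodeGenTypes/MachineValueType.h; the splitting policy is illustrative only.

#include "llvm/CodeGenTypes/MachineValueType.h"

using namespace llvm;

// Halve a vector MVT while keeping its element type; non-vectors and
// single-element vectors are returned unchanged.
static MVT halfVector(MVT VT) {
  if (!VT.isVector() || VT.getVectorNumElements() < 2)
    return VT;
  return MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements() / 2);
}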
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
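A sketch of the MachineInstrBuilder chaining style listed above (addReg, addImm, cloneMemRefs), as used inside custom-inserter code. The opcode is a placeholder for whatever target instruction is wanted; nothing here is the in-tree lowering.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Emit "DstReg = <Opcode> SrcReg, Imm" immediately before MI and copy MI's
// memory operands onto the new instruction.
static MachineInstr *emitBefore(MachineInstr &MI, unsigned Opcode,
                                Register DstReg, Register SrcReg, int64_t Imm,
                                const TargetInstrInfo &TII) {
  MachineBasicBlock &MBB = *MI.getParent();
  return BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
      .addReg(SrcReg)
      .addImm(Imm)
      .cloneMemRefs(MI);
}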
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
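A sketch of allocating a MachineMemOperand with the flags above, via the MachineFunction::getMachineMemOperand overload listed earlier: a 4-byte, naturally aligned, dereferenceable, invariant load. The size and alignment are example values.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  MachineMemOperand::Flags F = MachineMemOperand::MOLoad |
                               MachineMemOperand::MODereferenceable |
                               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, F, LLT::scalar(32), Align(4));
}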
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
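The three predicates above belong to MemoryEffects from llvm/Support/ModRef.h. A minimal sketch that classifies a summary with them; the category strings are only for illustration.

#include "llvm/Support/ModRef.h"

using namespace llvm;

static const char *describeMemoryEffects(MemoryEffects ME) {
  if (ME.doesNotAccessMemory())
    return "none";
  if (ME.onlyReadsMemory())
    return "read-only";
  if (ME.onlyWritesMemory())
    return "write-only";
  return "read-write";
}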
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
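A small sketch of the SDValue/SDNode accessors above, in the shape of a DAG-combine style helper. The helper is illustrative, not taken from this file.

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Given a two-operand node N with a single use, return the operand that is
// not Val, or an empty SDValue if N does not match.
static SDValue getOtherOperand(SDValue N, SDValue Val) {
  if (N.getNode()->getNumOperands() != 2 || !N.hasOneUse())
    return SDValue();
  if (N.getOperand(0) == Val)
    return N.getOperand(1);
  if (N.getOperand(1) == Val)
    return N.getOperand(0);
  return SDValue();
}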
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a zero-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether the operand is known to never be any NaN; if SNaN is true, whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we are happy to sink it into basic blocks.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
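Many of the SITargetLowering hooks listed above (LowerOperation and the lowerXYZ helpers) follow a per-opcode dispatch pattern. A generic sketch of that shape with one worked case, unsigned max expanded to a compare-and-select; this is not the in-tree SITargetLowering::LowerOperation.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue dispatchCustomLowering(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  case ISD::UMAX: {
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);
    EVT VT = Op.getValueType();
    EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
        DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue AGtB = DAG.getSetCC(DL, CCVT, A, B, ISD::SETUGT);
    return DAG.getSelect(DL, VT, AGtB, A, B);
  }
  default:
    return SDValue(); // let common legalization handle everything else
  }
}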
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:748
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:980
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:575
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:799
const Pass * getPass() const
Definition: SelectionDAG.h:491
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:501
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:854
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:825
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:495
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:710
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:496
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:698
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:490
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:871
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:508
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:584
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:578
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
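Two small sketches of the SelectionDAG helpers above; both helpers are illustrative and assume nothing beyond the listed APIs.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// (or A, B) behaves like (add A, B) when the operands share no set bits.
static bool orActsLikeAdd(SelectionDAG &DAG, SDValue Or) {
  if (Or.getOpcode() != ISD::OR)
    return false;
  return DAG.haveNoCommonBitsSet(Or.getOperand(0), Or.getOperand(1));
}

// Zero-extend or truncate V to VT, then set its low bit; shows how
// getZExtOrTrunc, getConstant, and getNode compose.
static SDValue setLowBit(SelectionDAG &DAG, const SDLoc &DL, SDValue V,
                         EVT VT) {
  SDValue X = DAG.getZExtOrTrunc(V, DL, VT);
  return DAG.getNode(ISD::OR, DL, VT, X, DAG.getConstant(1, DL, VT));
}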
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
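A minimal sketch of StringRef predicates together with StringSwitch, in the style of constraint-string classification. The categories and the strings being matched are purely illustrative, not the real AMDGPU constraint set.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

enum class ExampleKind { Scalar, Vector, Unknown };

static ExampleKind classifyConstraintLike(StringRef C) {
  if (C.starts_with("{") && C.ends_with("}"))
    return ExampleKind::Unknown; // a braced physical-register name
  return StringSwitch<ExampleKind>(C)
      .Case("s", ExampleKind::Scalar)
      .Case("v", ExampleKind::Vector)
      .Default(ExampleKind::Unknown);
}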
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
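The setters above run inside a target's TargetLowering constructor; the public query side is what lowering code consults afterwards. A sketch of that query side, with an example opcode/type pair.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGenTypes/MachineValueType.h"

using namespace llvm;

// True when i32 is a legal type and SHL on i32 is legal or custom-lowered.
static bool hasNativeShl32(const TargetLoweringBase &TLI) {
  return TLI.isTypeLegal(MVT::i32) &&
         TLI.isOperationLegalOrCustom(ISD::SHL, MVT::i32);
}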
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
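A minimal sketch combining the Value entries above (takeName, replaceAllUsesWith, use_empty); the helper name is illustrative.

#include "llvm/IR/Value.h"

using namespace llvm;

// Replace all uses of From with To, transfer the name, and report whether
// From is now dead and can be erased by the caller.
static bool replaceAndRename(Value &From, Value &To) {
  To.takeName(&From);
  From.replaceAllUsesWith(&To);
  return From.use_empty();
}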
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
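A sketch using the address-space enumerators above, under the assumption that in the LLVM 20 layout they are exposed through llvm/Support/AMDGPUAddrSpace.h; the grouping itself is only an example.

#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace llvm;

// True for the three buffer-style address spaces listed above.
static bool isBufferPointerAS(unsigned AS) {
  return AS == AMDGPUAS::BUFFER_FAT_POINTER ||
         AS == AMDGPUAS::BUFFER_RESOURCE ||
         AS == AMDGPUAS::BUFFER_STRIDED_POINTER;
}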
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable as an integer operand, i.e. not one of the inline values reserved for floating-point constants.
Is this literal inlinable as an integer operand, i.e. not one of the inline values reserved for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
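A hedged sketch of how the inline-literal queries above are typically consulted; canUseInlineConstant is an illustrative name, and the subtarget query GCNSubtarget::hasInv2PiInlineImm is assumed here rather than quoted from this file:

#include "GCNSubtarget.h"

// Sketch: a 32-bit immediate can be folded as an inline operand only when
// the encoding allows it; HasInv2Pi covers the 1/(2*pi) inline constant.
static bool canUseInlineConstant(int32_t Imm, const llvm::GCNSubtarget &ST) {
  return llvm::AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
}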
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
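A small sketch of the kind of calling-convention dispatch these enumerators support; the helper name isEntryLikeCC is hypothetical:

#include "llvm/IR/CallingConv.h"

// Sketch: kernels and compute shaders are entry-point calling conventions
// and receive their arguments through dedicated mechanisms rather than a
// normal call frame.
static bool isEntryLikeCC(llvm::CallingConv::ID CC) {
  return CC == llvm::CallingConv::AMDGPU_KERNEL ||
         CC == llvm::CallingConv::AMDGPU_CS;
}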
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
SMULO/UMULO - Same as the overflow-detecting add/subtract nodes ([US]ADDO/[US]SUBO), but for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1646
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1613
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1593
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1600
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
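For example, a compare lowered from IR maps its predicate onto a SelectionDAG condition code through the helper above; a minimal sketch:

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"

// Sketch: ICMP_SLT becomes ISD::SETLT, ICMP_ULT becomes ISD::SETULT, etc.
static llvm::ISD::CondCode lowerPredicate(llvm::ICmpInst::Predicate P) {
  return llvm::getICmpCondCode(P);
}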
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of zero bits from the least significant bit towards the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
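A brief sketch combining the range helpers documented above (drop_begin, append_range, any_of) on a SmallVector; purely illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Sketch: copy all but the first element, then test for a zero.
static bool hasZeroAfterFirst(const llvm::SmallVectorImpl<int> &In) {
  llvm::SmallVector<int, 8> Rest;
  llvm::append_range(Rest, llvm::drop_begin(In)); // skip element 0
  return llvm::any_of(Rest, [](int V) { return V == 0; });
}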
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count the number of zero bits from the most significant bit towards the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
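A few of the bit-manipulation helpers above in one sketch, splitting a 64-bit immediate into 32-bit halves the way constant-materialization code often does; the function itself is illustrative only:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

// Sketch: inspect a 64-bit immediate via its 32-bit halves.
static void describeImm(uint64_t Imm) {
  uint32_t Lo = llvm::Lo_32(Imm);                  // low 32 bits
  uint32_t Hi = llvm::Hi_32(Imm);                  // high 32 bits
  bool ContiguousOnes = llvm::isShiftedMask_64(Imm);
  unsigned TrailingZeros = llvm::countr_zero(Imm);
  (void)Lo; (void)Hi; (void)ContiguousOnes; (void)TrailingZeros;
}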
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
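A quick sketch of the alignment helpers above (alignTo and commonAlignment), assuming nothing beyond what is listed:

#include "llvm/Support/Alignment.h"

// Sketch: pad a byte size up to an alignment boundary, and compute the
// alignment still guaranteed at Base+Offset when Base is A-aligned.
static uint64_t paddedSize(uint64_t Size, llvm::Align A, uint64_t Offset) {
  llvm::Align AtOffset = llvm::commonAlignment(A, Offset);
  (void)AtOffset;
  return llvm::alignTo(Size, A); // smallest multiple of A >= Size
}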
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:297
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if this vector type has a power-of-two number of elements.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
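A compact sketch exercising the EVT helpers listed above; it simply builds and queries a 4 x i32 vector type:

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>

// Sketch: construct a v4i32 EVT and query its shape.
static llvm::EVT makeV4I32(llvm::LLVMContext &Ctx) {
  llvm::EVT Elt = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT Vec = llvm::EVT::getVectorVT(Ctx, Elt, 4);
  assert(Vec.isVector() && Vec.getVectorNumElements() == 4);
  assert(Vec.getScalarType() == Elt);
  return Vec;
}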
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
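A small sketch of the KnownBits entries above, combining the known bits of two addends; it assumes both operands have the same bit width and is not specific to this file:

#include "llvm/Support/KnownBits.h"

// Sketch: known bits of a sum, and how many leading zeros are guaranteed.
static unsigned leadingZerosOfSum(const llvm::KnownBits &LHS,
                                  const llvm::KnownBits &RHS) {
  llvm::KnownBits Sum = llvm::KnownBits::add(LHS, RHS);
  return Sum.isUnknown() ? 0 : Sum.countMinLeadingZeros();
}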
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
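A one-line sketch showing how the getFixedStack factory above is typically used to describe a spill-slot access; FrameIdx is assumed to be a valid frame index:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Sketch: pointer info for a load/store of the stack object FrameIdx.
static llvm::MachinePointerInfo spillSlotInfo(llvm::MachineFunction &MF,
                                              int FrameIdx) {
  return llvm::MachinePointerInfo::getFixedStack(MF, FrameIdx);
}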
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals